; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s ; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s ; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s ; Test using saddr addressing mode of flat_* atomic instructions. Make ; sure these are not incorrectly selected before gfx1250. define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: 
buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } ; Maximum positive offset on gfx10 define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:2047 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:2047 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } ; Maximum negative offset on gfx10 define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr 
inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: ; GFX1250: ; 
%bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: 
; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:2048 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_2048: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:2048 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2048 %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_neg2048: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; -------------------------------------------------------------------------------- ; Uniformity edge cases ; -------------------------------------------------------------------------------- @ptr.in.lds = internal addrspace(3) global ptr poison ; Base pointer is uniform, but also in VGPRs define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 
s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, 
vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: ; return to shader part epilog ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: 
flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:42 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:42 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Base pointer is uniform, but also in VGPRs define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; 
GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 
; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; 
GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:42 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:42 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; All atomicrmw ops ; -------------------------------------------------------------------------------- ; -------------------------------------------------------------------------------- ; atomicrmw xchg ; -------------------------------------------------------------------------------- define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1250-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: 
scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB10_5 ; GFX1250-SDAG-NEXT: .LBB10_5: ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_4 ; GFX1250-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, 
s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_5: ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_4 ; GFX950-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB10_5 ; GFX950-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; 
GFX950-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX950-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB10_5 ; GFX950-SDAG-NEXT: .LBB10_5: ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_4 ; GFX950-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB10_5 ; GFX950-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX950-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 
vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB10_5 ; GFX950-GISEL-NEXT: .LBB10_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1250-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB11_5 ; GFX1250-SDAG-NEXT: .LBB11_5: ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, 
v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_4 ; GFX1250-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_5: ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_4 ; GFX950-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB11_5 ; GFX950-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB11_5 ; GFX950-SDAG-NEXT: .LBB11_5: ; ; GFX950-GISEL-LABEL: 
flat_xchg_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_4 ; GFX950-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB11_5 ; GFX950-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX950-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB11_5 ; GFX950-GISEL-NEXT: 
.LBB11_5:
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
  %rtn = atomicrmw xchg ptr %gep1, i64 %data syncscope("agent") seq_cst
  %cast.rtn = bitcast i64 %rtn to <2 x float>
  ret <2 x float> %cast.rtn
}

; i64 xchg (agent scope, seq_cst) whose result is unused; the function returns void.
; Address = inreg %sbase + zext(%voffset), with no constant offset.
; The assertions for every run line show separate %atomicrmw.global and
; %atomicrmw.private paths for this operation.
define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1
; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4
; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3
; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_4
; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX1250-SDAG-NEXT: s_endpgm
; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global
; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0
; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32
vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_4 ; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_4 ; GFX950-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v4, v1
; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_3
; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_4
; GFX950-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: buffer_wbl2 sc1
; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5]
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off
; GFX950-GISEL-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %unused = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst
  ret void
}

; i64 xchg (agent scope, seq_cst) whose result is unused; the function returns void.
; Address = inreg %sbase + zext(%voffset) + a constant -128 byte offset.
; The assertions for every run line show separate %atomicrmw.global and
; %atomicrmw.private paths for this operation.
define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32
v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_4 ; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_4 ; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_4 ; GFX950-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; 
GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_3
; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_4
; GFX950-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
; GFX950-GISEL-NEXT: s_endpgm
; GFX950-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global
; GFX950-GISEL-NEXT: buffer_wbl2 sc1
; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5]
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off
; GFX950-GISEL-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
  %unused = atomicrmw xchg ptr %gep1, i64 %data syncscope("agent") seq_cst
  ret void
}

; --------------------------------------------------------------------------------
; atomicrmw add
; --------------------------------------------------------------------------------

; i32 add (agent scope, seq_cst) whose old value is returned, bitcast to float.
; Address = inreg %sbase + zext(%voffset), with no constant offset.
define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT:
global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
;
; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: buffer_wbl2 sc1
; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; return to shader part epilog
;
; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: buffer_wbl2 sc1
; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %rtn = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 add (agent scope, seq_cst) whose old value is returned, bitcast to float.
; Address = inreg %sbase + zext(%voffset) + a constant -128 byte offset.
define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
;
; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn_neg128:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1],
s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: buffer_wbl2 sc1
; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: ; return to shader part epilog
;
; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn_neg128:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX950-GISEL-NEXT: buffer_wbl2 sc1
; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
  %rtn = atomicrmw add ptr %gep1, i32 %data syncscope("agent") seq_cst
  %cast.rtn = bitcast i32 %rtn to float
  ret float %cast.rtn
}

; i32 add (agent scope, seq_cst) whose result is unused; the function returns void.
; Address = inreg %sbase + zext(%voffset), with no constant offset.
define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: buffer_wbl2 sc1
;
GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX950-GISEL-NEXT: s_nop 1
; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX950-GISEL-NEXT: buffer_wbl2 sc1
; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-GISEL-NEXT: buffer_inv sc1
; GFX950-GISEL-NEXT: s_endpgm
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
  %unused = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst
  ret void
}

; i32 add (agent scope, seq_cst) whose result is unused; the function returns void.
; Address = inreg %sbase + zext(%voffset) + a constant -128 byte offset.
define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt 0x0
; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn_neg128:
; GFX950-SDAG: ; %bb.0:
; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
; GFX950-SDAG-NEXT: s_nop 1
; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX950-SDAG-NEXT: buffer_wbl2 sc1
; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2
; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-SDAG-NEXT: buffer_inv sc1
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn_neg128:
; GFX950-GISEL: ; %bb.0:
; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc,
v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw add ptr %gep1, i32 %data syncscope("agent") seq_cst ret void }
; Returning 64-bit atomicrmw add (agent scope, seq_cst) at %sbase + zext(%voffset);
; the result is bitcast to <2 x float> and returned. All four check sets split
; into %atomicrmw.global and %atomicrmw.private paths; only the gfx1250 GlobalISel
; global path passes s[2:3] as the scalar base of flat_atomic_add_u64.
define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_4 ; GFX1250-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; 
GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB18_5 ; GFX1250-SDAG-NEXT: .LBB18_5: ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_4 ; GFX1250-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_5: ; ; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], 
s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_4 ; GFX950-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB18_5 ; GFX950-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX950-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB18_5 ; GFX950-SDAG-NEXT: .LBB18_5: ; ; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; 
GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_4 ; GFX950-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB18_5 ; GFX950-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX950-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB18_5 ; GFX950-GISEL-NEXT: .LBB18_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn }
; Returning 64-bit atomicrmw add (agent scope, seq_cst) with an extra constant
; -128 byte GEP on top of %sbase + zext(%voffset). In the gfx1250 GlobalISel
; global path the constant shows up as offset:-128 on the s[2:3]-based
; flat_atomic_add_u64; the other check sets add -128 into the VGPR address
; before issuing the atomic.
define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; 
GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_4 ; GFX1250-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB19_5 ; GFX1250-SDAG-NEXT: .LBB19_5: ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_4 ; GFX1250-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_3: ; 
%atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_5: ; ; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX950-SDAG-NEXT: ; 
%bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_4 ; GFX950-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB19_5 ; GFX950-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX950-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB19_5 ; GFX950-SDAG-NEXT: .LBB19_5: ; ; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], 
exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_4 ; GFX950-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB19_5 ; GFX950-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX950-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB19_5 ; GFX950-GISEL-NEXT: .LBB19_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw add ptr %gep1, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn }
; Non-returning 64-bit atomicrmw add (agent scope, seq_cst) at
; %sbase + zext(%voffset); the result is unused and the function returns void.
; The checks keep the %atomicrmw.global / %atomicrmw.private split, and only
; the gfx1250 GlobalISel global path uses the s[2:3] scalar base.
define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 
; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_4 ; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; 
GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_4 ; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off 
; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_4 ; GFX950-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX950-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; 
GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_4 ; GFX950-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX950-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst ret void }
; Non-returning 64-bit atomicrmw add (agent scope, seq_cst) with an extra
; constant -128 byte GEP on top of %sbase + zext(%voffset); returns void.
; In the gfx1250 GlobalISel global path the constant shows up as offset:-128 on
; the s[2:3]-based flat_atomic_add_u64; the other check sets add -128 into the
; VGPR address before issuing the atomic.
define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_4 ; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, 
src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_4 ; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: 
v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_4 ; GFX950-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX950-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] ; GFX950-SDAG-NEXT: 
scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_4 ; GFX950-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX950-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw add ptr %gep1, i64 %data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw sub ; --------------------------------------------------------------------------------
; Returning 32-bit atomicrmw sub (agent scope, seq_cst) at %sbase + zext(%voffset);
; the result is bitcast to float and returned. Both gfx1250 selectors share one
; check set using flat_atomic_sub_u32 with the s[2:3] scalar base, while the
; gfx950 check sets build the address in a VGPR pair for flat_atomic_sub.
define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_sub_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i32 %data 
syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Returning i32 atomicrmw sub through an extra constant -128 byte offset. ; The gfx1250 checks expect it folded as offset:-128; the gfx950 checks expect an explicit 64-bit add of 0xffffff80 / -1 before the atomic. ; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr
%gep0, i64 -128 %rtn = atomicrmw sub ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; i32 atomicrmw sub whose result is unused. ; The checks expect the non-returning encodings: no th:TH_ATOMIC_RETURN on gfx1250 and no sc0 on gfx950. ; GFX1250-LABEL: flat_sub_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Unused-result i32 atomicrmw sub through an extra constant -128 byte offset. ; The gfx1250 checks expect it folded as offset:-128; the gfx950 checks expect an explicit 64-bit add before the atomic. ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ;
GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw sub ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; Returning i64 atomicrmw sub; the old value comes back as <2 x float>. ; Both targets are expected to branch on the high address dword: atomicrmw.global issues the flat atomic, atomicrmw.private does a scratch load / subtract / store instead. ; On gfx1250 only the GlobalISel checks use the s[2:3] base form; the SelectionDAG checks address through a VGPR pair. ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ;
GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_4 ; GFX1250-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB26_5 ; GFX1250-SDAG-NEXT: .LBB26_5: ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; 
GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_4 ; GFX1250-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_5: ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_4 ; GFX950-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB26_5 ; GFX950-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX950-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: 
scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB26_5 ; GFX950-SDAG-NEXT: .LBB26_5: ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_4 ; GFX950-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB26_5 ; GFX950-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX950-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; 
GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB26_5 ; GFX950-GISEL-NEXT: .LBB26_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; Returning i64 atomicrmw sub through an extra constant -128 byte offset. ; The expected code adds -128 to the 64-bit address before the global/private branch; on gfx1250 the GlobalISel checks additionally put offset:-128 on the s[2:3]-based atomic. ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_4 ; GFX1250-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi ;
GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB27_5 ; GFX1250-SDAG-NEXT: .LBB27_5: ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, 
vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_4 ; GFX1250-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, 
v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_5: ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_4 ; GFX950-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB27_5 ; GFX950-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX950-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 
; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB27_5 ; GFX950-SDAG-NEXT: .LBB27_5: ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_4 ; GFX950-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB27_5 ; GFX950-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX950-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB27_5 ; GFX950-GISEL-NEXT: .LBB27_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw sub ptr %gep1, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; i64 atomicrmw sub whose result is unused. ; The expected code still splits into atomicrmw.global (flat atomic) and atomicrmw.private (scratch load / subtract / store); every exit is s_endpgm. ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_4 ; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ;
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_4 ; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: 
flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_4 ; GFX950-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: 
$vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX950-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_4 ; GFX950-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX950-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst ret void } define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_4 ; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] 
scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; 
GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_4 ; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: 
s_cbranch_execnz .LBB29_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_4 ; GFX950-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX950-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_4 ; GFX950-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX950-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw sub ptr %gep1, i64 %data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw and ; -------------------------------------------------------------------------------- ; Returning i32 atomicrmw and (agent scope, seq_cst) on a flat pointer formed as %sbase + zext(%voffset). The old value is bitcast to float and returned. define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to
shader part epilog ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Returning i32 atomicrmw and (agent scope, seq_cst) on %sbase + zext(%voffset) with an additional constant byte offset of -128. The old value is bitcast to float and returned. define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ;
GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw and ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Result-unused i32 atomicrmw and (agent scope, seq_cst) on a flat pointer formed as %sbase + zext(%voffset). The function returns void. define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt
vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } ; Result-unused i32 atomicrmw and (agent scope, seq_cst) on %sbase + zext(%voffset) with an additional constant byte offset of -128. The function returns void. define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32
v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw and ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } ; Returning i64 atomicrmw and (agent scope, seq_cst) on a flat pointer formed as %sbase + zext(%voffset). The old value is bitcast to <2 x float> and returned. The expected output below contains separate atomicrmw.global and atomicrmw.private blocks for this 64-bit case. define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_4 ; GFX1250-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64
v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB34_5 ; GFX1250-SDAG-NEXT: .LBB34_5: ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_4 ; GFX1250-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_5: ; ; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; 
GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_4 ; GFX950-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB34_5 ; GFX950-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX950-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB34_5 ; GFX950-SDAG-NEXT: .LBB34_5: ; ; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; 
GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_4 ; GFX950-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB34_5 ; GFX950-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX950-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB34_5 ; GFX950-GISEL-NEXT: .LBB34_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Returning i64 atomicrmw and (agent scope, seq_cst) on %sbase + zext(%voffset) with an additional constant byte offset of -128. The old value is bitcast to <2 x float> and returned. define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 ::
v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_4 ; GFX1250-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB35_5 ; GFX1250-SDAG-NEXT: .LBB35_5: ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_4 ; GFX1250-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_3: ; 
%atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_5: ; ; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; 
GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_4 ; GFX950-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB35_5 ; GFX950-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX950-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB35_5 ; GFX950-SDAG-NEXT: .LBB35_5: ; ; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc 
; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_4 ; GFX950-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB35_5 ; GFX950-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX950-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB35_5 ; GFX950-GISEL-NEXT: .LBB35_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw and ptr %gep1, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_4 ; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: 
v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_4 ; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; 
GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_4 ; GFX950-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 
src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_4 ; GFX950-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX950-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst ret void } ; i64 'and' with the result unused and a constant -128 byte offset. ; The gfx1250 GlobalISel checks keep it as offset:-128 on the atomic that names s[2:3]. ; The other check sets add the constant into the VGPR address before the atomic. define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_4 ; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; 
%bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_4 ; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_4 ; GFX950-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: 
v_and_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_4 ; GFX950-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX950-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, 
v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw and ptr %gep1, i64 %data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw or ; -------------------------------------------------------------------------------- ; i32 'or' returning the old value, no immediate offset. ; Both gfx1250 selectors share one check set whose atomic names v0 and s[2:3]. ; The gfx950 checks build a 64-bit VGPR address and pass that pair to the atomic. define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = 
atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; i32 'or' returning the old value, with a constant -128 byte offset. ; The gfx1250 checks expect offset:-128 on the atomic that names s[2:3]. ; The gfx950 checks expect 0xffffff80 added (with carry) into the VGPR address first. define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = 
getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw or ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; i32 'or' with the result unused and no immediate offset. ; The expected atomics carry neither th:TH_ATOMIC_RETURN (gfx1250) nor sc0 (gfx950). define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } ; i32 'or' with the result unused and a constant -128 byte offset. ; The gfx1250 checks expect offset:-128 on the atomic that names s[2:3]. ; The gfx950 checks expect 0xffffff80 added (with carry) into the VGPR address first. define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw or ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } ; i64 'or' returning the old value, no immediate offset. ; Every check set splits into %atomicrmw.global (flat atomic) and ; %atomicrmw.private (scratch load, v_or, scratch store). ; Only the gfx1250 GlobalISel checks name s[2:3] on the global-path atomic. define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 
s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_4 ; GFX1250-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB42_5 ; GFX1250-SDAG-NEXT: .LBB42_5: ; ; 
GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_4 ; GFX1250-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_5: ; ; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_4 ; GFX950-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB42_5 ; GFX950-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX950-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; 
GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB42_5 ; GFX950-SDAG-NEXT: .LBB42_5: ; ; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_4 ; GFX950-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB42_5 ; GFX950-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX950-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, 
-1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB42_5 ; GFX950-GISEL-NEXT: .LBB42_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_4 ; GFX1250-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB43_5 ; GFX1250-SDAG-NEXT: .LBB43_5: ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; 
GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_4 ; GFX1250-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, 
v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_5: ; ; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_4 ; GFX950-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB43_5 ; GFX950-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX950-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], 
off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB43_5 ; GFX950-SDAG-NEXT: .LBB43_5: ; ; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_4 ; GFX950-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB43_5 ; GFX950-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX950-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: 
v_or_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB43_5 ; GFX950-GISEL-NEXT: .LBB43_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw or ptr %gep1, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; 64-bit atomicrmw or with an unused result at %sbase + zext(%voffset). ; Every configuration is expected to split into atomicrmw.global and atomicrmw.private paths, and only the gfx1250 GlobalISel flat atomic uses s[2:3] as its base. define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_4 ; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT:
s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_4 ; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: 
s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_4 ; GFX950-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; 
GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_4 ; GFX950-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX950-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: 
scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst ret void } ; 64-bit atomicrmw or with an unused result at %sbase + zext(%voffset) - 128. ; Every configuration is expected to split into atomicrmw.global and atomicrmw.private paths, and only the gfx1250 GlobalISel flat atomic keeps s[2:3] with offset:-128. define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_4 ; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ;
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_4 ; GFX1250-GISEL-NEXT: .LBB45_2: ; 
%atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] 
; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_4 ; GFX950-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_4 ; GFX950-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; 
GFX950-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX950-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw or ptr %gep1, i64 %data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw xor ; -------------------------------------------------------------------------------- ; 32-bit atomicrmw xor with a used result at %sbase + zext(%voffset). ; gfx1250 is expected to put s[2:3] directly on flat_atomic_xor_b32, while gfx950 is expected to build the address in VGPRs first. define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1],
s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; 32-bit atomicrmw xor with a used result at %sbase + zext(%voffset) - 128. ; gfx1250 is expected to fold the -128 into offset:-128 on the s[2:3] form, while gfx950 is expected to add 0xffffff80 with carry in VGPRs. define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ;
GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw xor ptr %gep1, i32 %data syncscope("agent") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; 32-bit atomicrmw xor with an unused result at %sbase + zext(%voffset). ; gfx1250 is expected to use the s[2:3] form without TH_ATOMIC_RETURN, while gfx950 is expected to build the address in VGPRs first. define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT:
v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst ret void } ; 32-bit atomicrmw xor with an unused result at %sbase + zext(%voffset) - 128. ; gfx1250 is expected to fold the -128 into offset:-128 on the s[2:3] form without TH_ATOMIC_RETURN, while gfx950 is expected to add 0xffffff80 with carry in VGPRs. define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT:
flat_atomic_xor v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw xor ptr %gep1, i32 %data syncscope("agent") seq_cst ret void } define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_4 ; GFX1250-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; 
implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB50_5 ; GFX1250-SDAG-NEXT: .LBB50_5: ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_4 ; GFX1250-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi ; 
GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_5: ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz 
.LBB50_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_4 ; GFX950-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB50_5 ; GFX950-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX950-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB50_5 ; GFX950-SDAG-NEXT: .LBB50_5: ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; 
GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_4 ; GFX950-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB50_5 ; GFX950-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX950-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB50_5 ; GFX950-GISEL-NEXT: .LBB50_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Returning i64 xor at agent scope with an extra constant byte offset of -128. The checks show a split into %atomicrmw.global and %atomicrmw.private paths. NOTE(review): on gfx1250 only the GlobalISel checks fold the constant into the flat atomic (s[2:3] offset:-128), the SelectionDAG checks add 0xffffffffffffff80 with a VALU add and use a 64-bit VGPR address - confirm this difference is intended. define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 
v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_4 ; GFX1250-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX1250-SDAG-NEXT: 
scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB51_5 ; GFX1250-SDAG-NEXT: .LBB51_5: ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_4 ; GFX1250-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 
0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_5: ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_4 ; GFX950-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch 
.LBB51_5 ; GFX950-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX950-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 ; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB51_5 ; GFX950-SDAG-NEXT: .LBB51_5: ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_4 ; GFX950-GISEL-NEXT: 
.LBB51_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB51_5 ; GFX950-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX950-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 ; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB51_5 ; GFX950-GISEL-NEXT: .LBB51_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw xor ptr %gep1, i64 %data syncscope("agent") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; No-return i64 xor at agent scope. The checks show a split into %atomicrmw.global and %atomicrmw.private paths, and the flat atomic carries no TH_ATOMIC_RETURN (gfx1250) or sc0 (gfx950) modifier. NOTE(review): on gfx1250 only the GlobalISel checks use the saddr operand s[2:3] in the global path, the SelectionDAG checks use a 64-bit VGPR address - confirm this difference is intended. define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; 
GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_4 ; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_4 ; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_4 ; GFX950-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: 
s_cbranch_execnz .LBB52_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_4 ; GFX950-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX950-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst ret void } ; No-return i64 xor at agent scope with an extra constant byte offset of -128. The checks show a split into %atomicrmw.global and %atomicrmw.private paths. NOTE(review): on gfx1250 only the GlobalISel checks fold the constant into the flat atomic (s[2:3] offset:-128), the SelectionDAG checks add 0xffffffffffffff80 with a VALU add and use a 64-bit VGPR address - confirm this difference is intended. define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_4 ; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_4 ; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: 
v_xor_b32_e32 v0, v0, v4 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_4 ; GFX950-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc1 ; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_4 ; GFX950-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc1 ; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX950-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw xor ptr %gep1, i64 
%data syncscope("agent") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw max ; -------------------------------------------------------------------------------- ; Returning i32 max at workgroup scope with address %sbase + zext(%voffset) and no constant offset. The checks expect the saddr form of flat_atomic_max_i32 on gfx1250 and a 64-bit VGPR address on gfx950. define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Returning i32 max at workgroup scope with an extra constant byte offset of -128 on top of %sbase + zext(%voffset). The gfx1250 checks expect the constant folded into offset:-128 of the saddr form, while gfx950 adds it to the VGPR address. define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw max ptr %gep1, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Non-returning i32 max at workgroup scope (the atomicrmw result is unused) with address %sbase + zext(%voffset). The gfx1250 checks expect the saddr form without TH_ATOMIC_RETURN and gfx950 a 64-bit VGPR address. define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn: ; GFX950-GISEL: ; 
%bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst ret void } ; Non-returning i32 max at workgroup scope with an extra constant byte offset of -128 on top of %sbase + zext(%voffset). The gfx1250 checks expect the constant folded into offset:-128 of the saddr form, while gfx950 adds it to the VGPR address. define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 
-128 %unused = atomicrmw max ptr %gep1, i32 %data syncscope("workgroup") seq_cst ret void } ; Returning i64 max at workgroup scope with address %sbase + zext(%voffset). The checks show the operation split into atomicrmw.global and atomicrmw.private paths, where the private path does a scratch load, a max and a scratch store. On gfx1250 only the GlobalISel checks use the saddr form of flat_atomic_max_i64. define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: 
scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB58_5 ; GFX1250-SDAG-NEXT: .LBB58_5: ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX1250-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; 
%atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_5: ; ; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4 ; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB58_5 ; GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private ; 
GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB58_5 ; GFX950-SDAG-NEXT: .LBB58_5: ; ; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4 ; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB58_5 ; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX950-GISEL-NEXT: 
.LBB58_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB58_5 ; GFX950-GISEL-NEXT: .LBB58_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Returning i64 max at workgroup scope with an extra constant byte offset of -128 on top of %sbase + zext(%voffset). The checks show the split into atomicrmw.global and atomicrmw.private paths. On gfx1250 only the GlobalISel checks use the saddr form with offset:-128, while the SelectionDAG checks add the constant into a VGPR address. define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; 
GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB59_5 ; GFX1250-SDAG-NEXT: .LBB59_5: ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: 
.LBB59_5: ; ; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4 ; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB59_5 ; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX950-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) 
; GFX950-SDAG-NEXT: s_branch .LBB59_5 ; GFX950-SDAG-NEXT: .LBB59_5: ; ; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4 ; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB59_5 ; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX950-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; 
GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB59_5 ; GFX950-GISEL-NEXT: .LBB59_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw max ptr %gep1, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Non-returning i64 max at workgroup scope (the atomicrmw result is unused) with address %sbase + zext(%voffset). The checks show the split into atomicrmw.global and atomicrmw.private paths, and both paths end the program without rejoining. On gfx1250 only the GlobalISel checks use the saddr form of flat_atomic_max_i64. define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_4 ; GFX1250-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; 
%atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_4 ; GFX1250-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; 
%atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_4 ; GFX950-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX950-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; 
GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_4 ; GFX950-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX950-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst ret void } define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_4 ; GFX1250-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_4 ; GFX1250-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: 
.LBB61_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_4 ; GFX950-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX950-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; 
GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_4 ; GFX950-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX950-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) 
; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw max ptr %gep1, i64 %data syncscope("workgroup") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw min ; -------------------------------------------------------------------------------- ; i32 signed min at sbase + zext(voffset), returning the result bitcast to float. gfx1250 checks one saddr-form flat_atomic_min_i32 with th:TH_ATOMIC_RETURN, gfx950 checks the 64-bit address built in VGPRs and flat_atomic_smin with sc0. define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup")
seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn }
; i32 signed min at sbase + zext(voffset) - 128, returning the result bitcast to float.
; gfx1250 checks the constant folded into offset:-128 on the saddr form, gfx950 checks
; a separate 64-bit add of the constant (0xffffff80 low, -1 high) before flat_atomic_smin.
define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw min ptr %gep1, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; i32 signed min at sbase + zext(voffset) with the result unused. gfx1250 checks the no-return saddr form, gfx950 checks flat_atomic_smin without sc0. define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT:
flat_atomic_min_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst ret void } ; i32 signed min at sbase + zext(voffset) - 128 with the result unused. gfx1250 checks offset:-128 on the no-return saddr form, gfx950 checks a separate 64-bit add of the constant before flat_atomic_smin. define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ;
GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw min ptr %gep1, i32 %data syncscope("workgroup") seq_cst ret void } ; i64 signed min at sbase + zext(voffset), returning the result bitcast to <2 x float>. All four check configurations branch between %atomicrmw.global (flat atomic) and %atomicrmw.private (scratch load, min, scratch store). NOTE(review) - only the gfx1250 GlobalISel checks show the saddr operand form on the flat atomic, confirm this is expected. define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def:
$vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB66_5 ; GFX1250-SDAG-NEXT: .LBB66_5: ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; 
GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_5: ; ; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4 ; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, 
exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB66_5 ; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX950-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB66_5 ; GFX950-SDAG-NEXT: .LBB66_5: ; ; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4 ; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi ; 
GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB66_5 ; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX950-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB66_5 ; GFX950-GISEL-NEXT: .LBB66_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; i64 signed min at sbase + zext(voffset) - 128, returning the result bitcast to <2 x float>. All four check configurations branch between %atomicrmw.global (flat atomic) and %atomicrmw.private (scratch load, min, scratch store). NOTE(review) - only the gfx1250 GlobalISel checks show the saddr operand form with offset:-128 on the flat atomic, confirm this is expected. define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1],
v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB67_5 ; GFX1250-SDAG-NEXT: .LBB67_5: ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: 
v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_5: ; ; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4 ; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB67_5 ; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX950-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; 
GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB67_5 ; GFX950-SDAG-NEXT: .LBB67_5: ; ; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4 ; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB67_5 ; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX950-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB67_5 ; GFX950-GISEL-NEXT: .LBB67_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw min ptr %gep1, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz 
.LBB68_4 ; GFX1250-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_4 ; 
GFX1250-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_4 ; GFX950-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: 
$vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX950-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_4 ; GFX950-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX950-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; 
GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst ret void } ; Same atomic as flat_min_saddr_i64_nortn with an extra -128 byte GEP. GFX1250 GlobalISel encodes it as offset:-128 on the saddr flat atomic; the other three outputs add the constant into the VGPR address before the flat atomic. define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_4 ; GFX1250-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ;
implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz 
.LBB69_4 ; GFX1250-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_4 ; GFX950-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; 
GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX950-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_4 ; GFX950-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] ; 
GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX950-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw min ptr %gep1, i64 %data syncscope("workgroup") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw umax ; -------------------------------------------------------------------------------- ; i32 unsigned max whose old value is returned as a float bitcast, no constant offset. GFX1250 emits the saddr form (s[2:3] operand, th:TH_ATOMIC_RETURN); both GFX950 outputs first build the 64-bit address in VGPRs. define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn: ; GFX950-GISEL:
; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Returning i32 umax with an extra -128 byte GEP: GFX1250 encodes it as offset:-128 on the saddr flat atomic, while both GFX950 outputs add 0xffffff80 with carry into the VGPR address. define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw umax ptr %gep1, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; i32 umax with the result unused and no constant offset: GFX1250 emits the non-returning saddr form followed by s_wait_storecnt_dscnt; GFX950 builds the address in VGPRs and waits on lgkmcnt only. define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst ret void } ; Non-returning i32 umax with an extra -128 byte GEP: offset:-128 on GFX1250, explicit 64-bit add of the constant on GFX950. define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT:
v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw umax ptr %gep1, i32 %data syncscope("workgroup") seq_cst ret void } ; i64 unsigned max whose old value is returned as <2 x float>, no constant offset. All four outputs branch between %atomicrmw.global and %atomicrmw.private paths; only the GFX1250 GlobalISel flat atomic takes the s[2:3] saddr operand. define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ;
GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB74_5 ; GFX1250-SDAG-NEXT: .LBB74_5: ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_5: ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 
s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4 ; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB74_5 ; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX950-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB74_5 ; GFX950-SDAG-NEXT: .LBB74_5: ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; 
GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4 ; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB74_5 ; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX950-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB74_5 ; GFX950-GISEL-NEXT: .LBB74_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Returning i64 umax with an extra -128 byte GEP. GFX1250 GlobalISel encodes it as offset:-128 on the saddr flat atomic; the other three outputs add the constant into the VGPR address first. define
amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB75_5 ; GFX1250-SDAG-NEXT: .LBB75_5: ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 
v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_5: ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4 ; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; 
GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB75_5 ; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX950-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB75_5 ; GFX950-SDAG-NEXT: .LBB75_5: ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: 
s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4 ; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB75_5 ; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX950-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB75_5 ; GFX950-GISEL-NEXT: .LBB75_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw umax ptr %gep1, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_4 ; GFX1250-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: 
v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_4 ; GFX1250-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: 
s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_4 ; GFX950-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX950-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_4 ; GFX950-GISEL-NEXT: 
.LBB76_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX950-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst ret void } define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_4 ; GFX1250-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, 
vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_4 ; GFX1250-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 
s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_4 ; GFX950-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX950-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_4 ; GFX950-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX950-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw umax ptr %gep1, i64 %data syncscope("workgroup") seq_cst ret void } ; -------------------------------------------------------------------------------- ; atomicrmw umin ; --------------------------------------------------------------------------------
; i32 umin that returns the old value. The address is %sbase (inreg) plus the
; zero-extended %voffset, with no immediate offset. The gfx1250 checks expect
; the saddr form (s[2:3] base with a VGPR offset), the gfx950 checks expect the
; 64-bit address to be formed in VGPRs before a plain flat_atomic_umin.
define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn }
; Returning i32 umin with an additional constant -128 byte offset applied after
; the zero-extended %voffset. The gfx1250 checks expect the constant to appear
; as offset:-128 on the saddr instruction, the gfx950 checks expect it to be
; added to the VGPR address (0xffffff80 low half, -1 high half with carry).
define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; 
GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw umin ptr %gep1, i32 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn }
; i32 umin whose result is unused (%unused), address %sbase plus zero-extended
; %voffset with no immediate offset. The checks expect the non-returning
; encodings, i.e. no th:TH_ATOMIC_RETURN on gfx1250 and no sc0 on gfx950.
define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw 
umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst ret void }
; Non-returning i32 umin with an additional constant -128 byte offset. The
; gfx1250 checks expect offset:-128 on the saddr instruction, the gfx950 checks
; expect the constant to be added to the VGPR address instead.
define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw umin ptr %gep1, i32 %data syncscope("workgroup") seq_cst ret void }
; i64 umin that returns the old value, no immediate offset. Unlike the i32
; cases, the checks show the operation split into %atomicrmw.global and
; %atomicrmw.private paths joined at %atomicrmw.phi. The path is chosen by
; comparing the high dword of the address with src_flat_scratch_base_hi
; (gfx1250) or with the high half of src_private_base (gfx950). On the global
; path only the gfx1250 GlobalISel output uses the saddr operand s[2:3], which
; is why SelectionDAG and GlobalISel are checked under separate prefixes.
define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB82_5 ; GFX1250-SDAG-NEXT: .LBB82_5: ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: 
v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_5: ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_4 ; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB82_5 ; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX950-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: 
scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB82_5 ; GFX950-SDAG-NEXT: .LBB82_5: ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4 ; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB82_5 ; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX950-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 
v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB82_5 ; GFX950-GISEL-NEXT: .LBB82_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; Returning i64 umin (workgroup scope, seq_cst) at sbase + zext(voffset) - 128. ; The checks split into %atomicrmw.global (flat atomic) and %atomicrmw.private (scratch load/min/store) paths. ; Only GFX1250-GISEL is expected to use the saddr form with offset:-128. define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN ; 
GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB83_5 ; GFX1250-SDAG-NEXT: .LBB83_5: ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_5: ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 
0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4 ; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB83_5 ; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX950-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB83_5 ; GFX950-SDAG-NEXT: .LBB83_5: ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: 
v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4 ; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB83_5 ; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX950-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB83_5 ; GFX950-GISEL-NEXT: .LBB83_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw 
umin ptr %gep1, i64 %data syncscope("workgroup") seq_cst %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } ; No-return i64 umin (workgroup scope, seq_cst) at sbase + zext(voffset). ; Only the GFX1250-GISEL checks use the saddr form (s[2:3] base); the others address through a VGPR pair. define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_4 ; GFX1250-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3] ; 
GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_4 ; GFX1250-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_4 ; GFX950-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX950-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_4 ; GFX950-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX950-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst ret void } ; No-return i64 umin with an extra constant -128 byte offset. ; GFX1250-GISEL is expected to fold it as offset:-128 on the saddr form; the other configurations add it into the VGPR address. define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_4 ; GFX1250-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, 
src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_4 ; GFX1250-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: 
s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_4 ; GFX950-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX950-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; 
GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_4 ; GFX950-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX950-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr 
%gep0, i64 -128 %unused = atomicrmw umin ptr %gep1, i64 %data syncscope("workgroup") seq_cst ret void } ; -------------------------------------------------------------------------------- ; cmpxchg ; -------------------------------------------------------------------------------- ; Returning i32 cmpxchg (seq_cst/seq_cst, no syncscope) at sbase + zext(voffset). ; GFX1250 is expected to select the saddr form with scope:SCOPE_SYS; GFX950 builds the address in a VGPR pair. define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i32 
%cmp, i32 %data seq_cst seq_cst %rtn = extractvalue { i32, i1 } %cmpxchg, 0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Returning i32 cmpxchg with an extra constant -128 byte offset. ; GFX1250 is expected to fold it as offset:-128; GFX950 adds 0xffffff80 / -1 into the VGPR address pair. define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 
sc1 ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %cmpxchg = cmpxchg ptr %gep1, i32 %cmp, i32 %data seq_cst seq_cst %rtn = extractvalue { i32, i1 } %cmpxchg, 0 %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; i32 cmpxchg (seq_cst/seq_cst, no syncscope) whose result is unused. ; GFX1250 is expected to use the saddr form without th:TH_ATOMIC_RETURN; GFX950 emits flat_atomic_cmpswap with only sc1. define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst 
ret void } ; No-return i32 cmpxchg with an extra constant -128 byte offset. ; GFX1250 is expected to fold it as offset:-128 on the saddr form; GFX950 adds it into the VGPR address pair. define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: v_mov_b32_e32 v3, v1 ; GFX1250-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = cmpxchg ptr %gep1, i32 %cmp, i32 
%data seq_cst seq_cst ret void }

; 64-bit cmpxchg (seq_cst/seq_cst) on %sbase + zext(%voffset); the loaded value
; is extracted and returned bitcast to <2 x float>. The expected output in all
; four configurations branches between an %atomicrmw.global block, which issues
; the 64-bit flat cmpswap, and an %atomicrmw.private block, which does a
; scratch load, compare, select and scratch store instead. In the gfx1250
; checks only the GlobalISel output passes s[2:3] as the base of the flat
; atomic; the SelectionDAG output uses a 64-bit VGPR address.
define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_4 ; GFX1250-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB90_5 ; GFX1250-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB90_5 ; GFX1250-SDAG-NEXT: .LBB90_5: ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_4 ; GFX1250-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch 
.LBB90_5 ; GFX1250-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB90_5 ; GFX1250-GISEL-NEXT: .LBB90_5: ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; 
GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_4 ; GFX950-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB90_5 ; GFX950-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX950-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB90_5 ; GFX950-SDAG-NEXT: .LBB90_5: ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; 
GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_4 ; GFX950-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB90_5 ; GFX950-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX950-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB90_5 ; GFX950-GISEL-NEXT: .LBB90_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst %rtn = extractvalue { i64, i1 } %cmpxchg, 0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn }

; Returning 64-bit cmpxchg with an additional constant byte offset of -128 on
; top of %sbase + zext(%voffset). The expected output keeps the
; %atomicrmw.global / %atomicrmw.private split. In the gfx1250 GlobalISel
; checks the flat atomic carries offset:-128 next to s[2:3]; the other
; configurations add the constant to the VGPR address before the atomic.
define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg 
%sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_4 ; GFX1250-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB91_5 ; GFX1250-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; 
%atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB91_5 ; GFX1250-SDAG-NEXT: .LBB91_5: ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz 
.LBB91_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_4 ; GFX1250-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB91_5 ; GFX1250-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB91_5 ; GFX1250-GISEL-NEXT: .LBB91_5: ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; 
GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_4 ; GFX950-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_branch .LBB91_5 ; GFX950-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX950-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB91_5 ; GFX950-SDAG-NEXT: .LBB91_5: ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; 
GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_4 ; GFX950-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_branch .LBB91_5 ; GFX950-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX950-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, 
v0, v6, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB91_5 ; GFX950-GISEL-NEXT: .LBB91_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %cmpxchg = cmpxchg ptr %gep1, i64 %cmp, i64 %data seq_cst seq_cst %rtn = extractvalue { i64, i1 } %cmpxchg, 0 %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn }

; 64-bit cmpxchg (seq_cst/seq_cst) on %sbase + zext(%voffset) whose result is
; unused; the function returns void. The expected output still branches
; between %atomicrmw.global and %atomicrmw.private blocks, and the flat atomic
; in the global block has no return-value operand. In the gfx1250 checks only
; the GlobalISel output passes s[2:3] as the base of the flat atomic.
define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_4 ; GFX1250-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv 
scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_4 ; GFX1250-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: 
s_endpgm ; GFX1250-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz 
.LBB92_4 ; GFX950-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX950-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_4 ; GFX950-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: 
.LBB92_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX950-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst ret void } define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 
v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_4 ; GFX1250-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 
s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_4 ; GFX1250-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; 
GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_4 ; GFX950-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX950-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], 
v[6:7] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_4 ; GFX950-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX950-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off 
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = cmpxchg ptr %gep1, i64 %cmp, i64 %data seq_cst seq_cst ret void } ; -------------------------------------------------------------------------------- ; amdgcn atomic inc (atomicrmw uinc_wrap) ; -------------------------------------------------------------------------------- define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Returning i32 uinc_wrap (agent scope, monotonic) at %sbase + zext(%voffset) - gfx1250 selects the saddr form with s[2:3], gfx950 builds the 64-bit address in VGPRs first. ; GFX1250-LABEL: flat_inc_saddr_i32_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw 
uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Same as flat_inc_saddr_i32_rtn with an extra constant -128 byte GEP - gfx1250 folds it into offset:-128, gfx950 adds 0xffffff80 / -1 with a carry add pair. ; GFX1250-LABEL: flat_inc_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw uinc_wrap ptr %gep1, i32 %data syncscope("agent") monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Non-returning i32 uinc_wrap (result unused) - the checks have no th:TH_ATOMIC_RETURN or sc0 modifier and no wait before s_endpgm. ; GFX1250-LABEL: 
flat_inc_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic ret void } define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; Non-returning i32 uinc_wrap with the constant -128 byte GEP - offset:-128 on gfx1250, carry add pair on gfx950. ; GFX1250-LABEL: flat_inc_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: 
v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw uinc_wrap ptr %gep1, i32 %data syncscope("agent") monotonic ret void } define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; Returning i64 uinc_wrap - the checks show an %atomicrmw.global path using the flat atomic and an %atomicrmw.private path that does the increment through scratch load, compare/select and scratch store. On gfx1250 only the GlobalISel output uses the saddr form for the flat atomic. ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_4 ; GFX1250-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB98_5 ; GFX1250-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 
s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB98_5 ; GFX1250-SDAG-NEXT: .LBB98_5: ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_4 ; GFX1250-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_5: ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: 
v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_4 ; GFX950-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB98_5 ; GFX950-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX950-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB98_5 ; GFX950-SDAG-NEXT: .LBB98_5: ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; 
GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_4 ; GFX950-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB98_5 ; GFX950-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX950-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB98_5 ; GFX950-GISEL-NEXT: .LBB98_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, 
i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; Returning i64 uinc_wrap with the constant -128 byte GEP - same %atomicrmw.global / %atomicrmw.private split. On gfx1250 GlobalISel folds the offset into the saddr form (offset:-128), SelectionDAG adds it to the VGPR address instead. ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_4 ; GFX1250-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB99_5 ; GFX1250-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: 
.LBB99_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_branch .LBB99_5 ; GFX1250-SDAG-NEXT: .LBB99_5: ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; 
GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_4 ; GFX1250-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_5: ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_4 ; GFX950-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB99_5 ; GFX950-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX950-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB99_5 ; 
GFX950-SDAG-NEXT: .LBB99_5: ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_4 ; GFX950-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB99_5 ; GFX950-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX950-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 
vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB99_5 ; GFX950-GISEL-NEXT: .LBB99_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw uinc_wrap ptr %gep1, i64 %data syncscope("agent") monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_4 ; GFX1250-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; 
GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_4 ; GFX1250-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: 
$vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_4 ; GFX950-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; 
implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX950-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_4 ; GFX950-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX950-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; 
GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic ret void } ; 64-bit atomicrmw uinc_wrap with the result unused and an extra constant -128 byte offset on top of sbase + zext(voffset). In the generated checks only the gfx1250 GlobalISel flat atomic carries offset:-128 together with s[2:3]; the other outputs add the offset into the VGPR address before the atomic. define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_4 ; GFX1250-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: 
.LBB101_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: 
v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_4 ; GFX1250-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], 
src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_4 ; GFX950-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX950-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 ; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: 
v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_4 ; GFX950-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX950-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw uinc_wrap ptr %gep1, i64 %data syncscope("agent") monotonic ret void } ; -------------------------------------------------------------------------------- ; atomicrmw udec_wrap (amdgcn atomic dec) ; -------------------------------------------------------------------------------- ; Returning i32 atomicrmw udec_wrap at sbase + zext(voffset); the old value is bitcast to float for the return. The gfx1250 checks expect the flat atomic to take s[2:3] directly with TH_ATOMIC_RETURN, while the gfx950 checks expect the address to be built in VGPRs and the atomic to carry sc0. define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn: ; 
GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Returning i32 atomicrmw udec_wrap with an extra constant -128 byte offset on top of sbase + zext(voffset). The gfx1250 checks expect the constant folded as offset:-128 on the flat atomic; the gfx950 checks expect it added to the VGPR address (0xffffff80 low half, -1 high half with carry). define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_rtn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_dec v0, 
v[0:1], v2 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: ; return to shader part epilog ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw udec_wrap ptr %gep1, i32 %data syncscope("agent") monotonic %cast.rtn = bitcast i32 %rtn to float ret float %cast.rtn } ; Non-returning i32 atomicrmw udec_wrap at sbase + zext(voffset); the result is unused. The gfx1250 checks expect the flat atomic to take s[2:3] directly without TH_ATOMIC_RETURN; the gfx950 checks expect a VGPR address and no sc0. define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = 
atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic ret void } ; Non-returning i32 atomicrmw udec_wrap with an extra constant -128 byte offset on top of sbase + zext(voffset). The gfx1250 checks expect the constant folded as offset:-128 on the flat atomic; the gfx950 checks expect it added to the VGPR address first. define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_dec_saddr_i32_nortn_neg128: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc ; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw udec_wrap ptr %gep1, i32 %data syncscope("agent") monotonic ret void } ; Returning i64 atomicrmw udec_wrap at sbase + zext(voffset); the old value is bitcast to <2 x float> for the return. The generated checks show a split into %atomicrmw.global (flat atomic) and %atomicrmw.private (scratch load, select, store) paths that rejoin at %atomicrmw.phi. define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_4 ; GFX1250-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB106_5 ; GFX1250-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2 ; 
GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-SDAG-NEXT: s_branch .LBB106_5 ; GFX1250-SDAG-NEXT: .LBB106_5: ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_4 ; GFX1250-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_5: ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_4 ; GFX950-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB106_5 ; GFX950-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 ; 
GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX950-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB106_5 ; GFX950-SDAG-NEXT: .LBB106_5: ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_4 ; GFX950-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB106_5 ; GFX950-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX950-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB106_5 ; GFX950-GISEL-NEXT: .LBB106_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_4 ; GFX1250-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_branch .LBB107_5 ; GFX1250-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] ; 
GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-SDAG-NEXT: s_branch .LBB107_5 ; GFX1250-SDAG-NEXT: .LBB107_5: ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_4 ; GFX1250-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: 
.LBB107_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: .LBB107_5: ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 ; 
GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_4 ; GFX950-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB107_5 ; GFX950-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX950-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_branch .LBB107_5 ; GFX950-SDAG-NEXT: .LBB107_5: ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; 
GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_4 ; GFX950-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB107_5 ; GFX950-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX950-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off ; GFX950-GISEL-NEXT: 
s_or_b64 exec, exec, s[2:3] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_branch .LBB107_5 ; GFX950-GISEL-NEXT: .LBB107_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %rtn = atomicrmw udec_wrap ptr %gep1, i64 %data syncscope("agent") monotonic %cast.rtn = bitcast i64 %rtn to <2 x float> ret <2 x float> %cast.rtn } define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_4 ; GFX1250-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: 
v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_4 ; GFX1250-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_4 ; GFX950-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; 
GFX950-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX950-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_4 ; GFX950-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX950-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; 
GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic ret void } define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_4 ; GFX1250-SDAG-NEXT: 
.LBB109_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_endpgm ; GFX1250-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2 ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_4 ; GFX1250-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], -1, v[0:1] ; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm ; ; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v2, v1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] ; GFX950-SDAG-NEXT: s_movk_i32 s2, 0xff80 ; GFX950-SDAG-NEXT: s_mov_b32 s3, -1 ; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_4 ; GFX950-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_endpgm ; GFX950-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX950-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] ; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 ; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off ; GFX950-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 
; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 ; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_3 ; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_4 ; GFX950-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_endpgm ; GFX950-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global ; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX950-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 ; GFX950-GISEL-NEXT: s_nop 1 ; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc ; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] ; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc ; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 %unused = atomicrmw 
udec_wrap ptr %gep1, i64 %data syncscope("agent") monotonic ret void } define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 ; GFX1250-SDAG-NEXT: s_branch .LBB110_6 ; GFX1250-SDAG-NEXT: .LBB110_3: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: s_branch .LBB110_7 ; GFX1250-SDAG-NEXT: .LBB110_4: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: 
s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[4:5], s2 ; GFX1250-SDAG-NEXT: .LBB110_6: ; %Flow1 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB110_8 ; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: 
s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[4:5], s2 ; GFX1250-GISEL-NEXT: .LBB110_5: ; %Flow1 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: .LBB110_6: ; %Flow2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 ; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB110_5 ; GFX950-SDAG-NEXT: s_branch .LBB110_6 ; GFX950-SDAG-NEXT: .LBB110_3: ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: s_branch .LBB110_7 ; GFX950-SDAG-NEXT: .LBB110_4: ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-SDAG-NEXT: .LBB110_6: ; %Flow1 ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB110_8 ; GFX950-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 ; 
GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX950-GISEL-NEXT: .LBB110_3: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 ; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[4:5], s2 ; GFX950-GISEL-NEXT: .LBB110_5: ; %Flow1 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX950-GISEL-NEXT: .LBB110_6: ; %Flow2 ; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 ; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-GISEL-NEXT: 
ds_add_rtn_f64 v[2:3], v2, v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret double %result } ; f64 atomicrmw fadd with an unused result, workgroup scope and seq_cst, on an inreg pointer advanced to element 10 of [512 x double] (0x50 in the checks); the checks show separate global, private (scratch) and shared (ds) paths. define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow2 ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_8 ; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 ; GFX1250-SDAG-NEXT: ; %bb.4: ; 
%atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 ; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[2:3], s2 ; GFX1250-SDAG-NEXT: .LBB111_7: ; %Flow1 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB111_2 ; GFX1250-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1] ; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 ; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 ; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[2:3], s2 ; GFX1250-GISEL-NEXT: .LBB111_5: ; %Flow1 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX1250-GISEL-NEXT: .LBB111_6: ; %Flow2 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 ; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: ds_add_f64 
v2, v[0:1] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 ; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow2 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_8 ; GFX950-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; GFX950-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_5 ; GFX950-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: .LBB111_5: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 ; GFX950-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; GFX950-SDAG-NEXT: .LBB111_7: ; %Flow1 ; GFX950-SDAG-NEXT: s_cbranch_execnz 
.LBB111_2 ; GFX950-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: ds_add_f64 v2, v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base ; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private ; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX950-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX950-GISEL-NEXT: .LBB111_3: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 ; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; GFX950-GISEL-NEXT: .LBB111_5: ; %Flow1 ; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 ; GFX950-GISEL-NEXT: .LBB111_6: ; %Flow2 ; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 ; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared ; GFX950-GISEL-NEXT: 
s_cmp_lg_u64 s[0:1], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-GISEL-NEXT: ds_add_f64 v2, v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; f64 atomicrmw fmax whose result is returned, workgroup scope and seq_cst, on an inreg pointer advanced to element 10 of [512 x double] (0x50 or offset:80 in the checks); the checks show a flat atomic path plus a private (scratch) load/max/store path. define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 ; GFX1250-SDAG-NEXT: s_branch .LBB112_4 ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; 
GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; 
GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB112_2 ; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB112_3 ; GFX950-SDAG-NEXT: s_branch .LBB112_4 ; GFX950-SDAG-NEXT: .LBB112_2: ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; 
GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX950-GISEL-NEXT: .LBB112_2: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 ; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret double %result } ; f64 atomicrmw fmax with an unused result, workgroup scope and seq_cst, on an inreg pointer advanced to element 10 of [512 x double] (0x50 or offset:80 in the checks); the checks show a flat atomic path plus a private (scratch) load/max/store path. define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: 
flat_atomic_fmax_f64_saddr_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB113_4 ; GFX1250-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: ; 
GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 ; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB113_4 ; GFX950-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; GFX950-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX950-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] offset:80 ; GFX950-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX950-GISEL-NEXT: .LBB113_2: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 ; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; f64 atomicrmw fmin whose result is returned, workgroup scope and seq_cst, on an inreg pointer advanced to element 10 of [512 x double] (0x50 or offset:80 in the checks); the checks show a flat atomic path plus a private (scratch) load/min/store path. define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN ; 
GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 ; GFX1250-SDAG-NEXT: s_branch .LBB114_4 ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB114_2 ; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_cbranch_execz .LBB114_3 ; GFX950-SDAG-NEXT: s_branch .LBB114_4 ; GFX950-SDAG-NEXT: .LBB114_2: ; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private ; 
GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX950-GISEL-NEXT: .LBB114_2: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 ; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end 
; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret double %result } ; f64 fmin on element 10 of [512 x double] (byte offset 80) off an inreg base pointer, result unused. The checks for both targets branch between an %atomicrmw.global block holding the flat atomic and an %atomicrmw.private block doing a scratch load/min/store. define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB115_4 ; GFX1250-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private ; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; 
GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn: ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; 
GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX1250-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi ; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base ; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 ; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 ; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 ; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow ; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB115_4 ; GFX950-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; GFX950-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX950-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private ; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 ; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 ; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base ; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 ; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 ; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] offset:80 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 ; GFX950-GISEL-NEXT: .LBB115_2: ; %Flow ; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 ; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 ; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private ; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 ; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 ; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] ; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 ; GFX950-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; f32 fadd on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, old value returned. The gfx1250 checks pass s[0:1] as the address operand of the atomic; the gfx950 checks copy the base into v[2:3] first. NOTE(review): align 8 on a float atomic looks carried over from the f64 tests - confirm it is intended. define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: 
flat_atomic_fadd_f32_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX950-SDAG-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret float %result } ; f32 fadd on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, result unused. The checked atomics have no destination register and carry neither TH_ATOMIC_RETURN nor sc0. define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX950-SDAG-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 ; 
GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; f32 fmax on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, old value returned. The gfx1250 checks expect a single flat_atomic_max_num_f32; the gfx950 checks expect a flat_atomic_cmpswap retry loop (%atomicrmw.start). define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX950-GISEL-NEXT: .LBB118_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB118_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret float %result } ; f32 fmax on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, result unused. The gfx1250 checks expect a single no-return flat_atomic_max_num_f32; the gfx950 checks still expect the flat_atomic_cmpswap retry loop. define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; 
GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v2, v4 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-GISEL-NEXT: .LBB119_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB119_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmax ptr %gep.0, float 
%data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; f32 fmin on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, old value returned. The gfx1250 checks expect a single flat_atomic_min_num_f32; the gfx950 checks expect a flat_atomic_cmpswap retry loop (%atomicrmw.start). define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 ; GFX950-GISEL-NEXT: .LBB120_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 ; GFX950-GISEL-NEXT: v_min_f32_e32 v4, v0, v1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB120_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret float %result } ; f32 fmin on element 10 of [512 x float] (byte offset 40) off an inreg base pointer, result unused. The gfx1250 checks expect a single no-return flat_atomic_min_num_f32; the gfx950 checks still expect the flat_atomic_cmpswap retry loop. define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { ; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 ; GFX950-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 ; GFX950-GISEL-NEXT: .LBB121_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB121_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; <2 x half> fadd on element 10 of [512 x <2 x half>] (byte offset 40) off an inreg base pointer, old value returned. All check prefixes expect flat_atomic_pk_add_f16; only the gfx1250 checks pass s[0:1] as the address operand. define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: 
flat_atomic_fadd_v2f16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 ; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } ; <2 x half> fadd on element 10 of [512 x <2 x half>] (byte offset 40) off an inreg base pointer, result unused. All check prefixes expect a flat_atomic_pk_add_f16 with no destination register. define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-SDAG-NEXT: 
v_mov_b32_e32 v3, s1 ; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; <2 x half> fmax on element 10 of [512 x <2 x half>] (byte offset 40) off an inreg base pointer, old value returned. Every check prefix, gfx1250 included, expects a cmpswap retry loop (%atomicrmw.start) around a packed f16 max. define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v5, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz 
.LBB124_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX950-GISEL-NEXT: .LBB124_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v1 ; GFX950-GISEL-NEXT: 
flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB124_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } ; <2 x half> fmax on element 10 of [512 x <2 x half>] (byte offset 40) off an inreg base pointer, result unused. Every check prefix, gfx1250 included, still expects a returning cmpswap retry loop (%atomicrmw.start) around a packed f16 max. define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB125_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: 
flat_atomic_fmax_v2f16_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB125_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } ; <2 x half> fmin on element 10 of [512 x <2 x half>] (byte offset 40) off an inreg base pointer, old value returned. Every check prefix, gfx1250 included, expects a cmpswap retry loop (%atomicrmw.start) around a packed f16 min. define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 ; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v5, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 ; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 ; GFX950-GISEL-NEXT: .LBB126_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_pk_min_f16 v4, v0, v1 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; 
GFX950-GISEL-NEXT: s_cbranch_execnz .LBB126_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %result } define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { ; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v0 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB127_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: ; GFX950-SDAG: ; %bb.0: ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 ; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-SDAG-NEXT: 
v_pk_max_f16 v4, v0, v0 ; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 ; GFX950-SDAG-NEXT: s_nop 0 ; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4 ; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 ; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 ; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1 ; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: ; GFX950-GISEL: ; %bb.0: ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 ; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 ; GFX950-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start ; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 ; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 ; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB127_1 ; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 
x <2 x half>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-NEXT: v_mov_b32_e32 v3, s1 ; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[2:3], v0 offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b32_e32 v2, s0 ; GFX950-NEXT: v_mov_b32_e32 v3, s1 ; GFX950-NEXT: flat_atomic_pk_add_bf16 v[2:3], v0 offset:40 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB130_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b32_e32 v7, v0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX950-NEXT: v_max_f32_e32 v5, v5, v1 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB130_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 ; GFX1250-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB131_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 
exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX950-NEXT: .LBB131_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 ; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB131_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 ; GFX1250-NEXT: .LBB132_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v5, v1 ; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB132_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: v_mov_b32_e32 v0, v1 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b32_e32 v7, v0 ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 ; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX950-NEXT: v_min_f32_e32 v5, v5, v1 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB132_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data 
syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %result } define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { ; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_mov_b32 s2, 0 ; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 ; GFX1250-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0 ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 ; GFX1250-NEXT: v_mov_b32_e32 v3, v2 ; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_cbranch_execnz .LBB133_1 ; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; ; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 ; GFX950-NEXT: s_mov_b64 s[2:3], 0 ; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 ; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX950-NEXT: .LBB133_1: ; %atomicrmw.start ; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 ; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 ; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 ; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 ; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], 
v[0:1] offset:40 sc0 ; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 ; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX950-NEXT: v_mov_b32_e32 v1, v0 ; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_cbranch_execnz .LBB133_1 ; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX950-NEXT: s_setpc_b64 s[30:31] %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 %unused = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ret void } attributes #0 = { argmemonly nounwind willreturn } !0 = !{} ; empty metadata node; operand of the !amdgpu.no.fine.grained.memory annotations on the atomicrmw instructions above