diff options
author | Vitaly Buka <vitalybuka@google.com> | 2024-04-02 14:23:42 -0700 |
---|---|---|
committer | Vitaly Buka <vitalybuka@google.com> | 2024-04-02 14:23:42 -0700 |
commit | 2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310 (patch) | |
tree | 4a2ce5eb31e8242dcbb7d7a3de82d3309fdc23c5 /llvm/test/CodeGen/AMDGPU | |
parent | eb6a41808ef4e058a24f9ebc6c85b10c966eb183 (diff) | |
parent | 89271b46761749503dffe94c60b9cbe0bda80284 (diff) | |
download | llvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.zip llvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.tar.gz llvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.tar.bz2 |
[𝘀𝗽𝗿] changes introduced through rebase
Created using spr 1.3.4
[skip ci]
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
117 files changed, 36973 insertions, 13816 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll index 255c6de..1a76f8c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1090,18 +1090,29 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB39_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1109,20 +1120,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB39_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB39_2 +; GFX90A-NEXT: .LBB39_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB39_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB39_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -1132,26 +1154,47 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB40_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB40_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB40_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB40_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1161,18 +1204,29 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB41_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1180,20 +1234,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB41_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB41_2 +; GFX90A-NEXT: .LBB41_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB41_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB41_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1203,26 +1268,47 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB42_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB42_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1394,37 +1480,59 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB49_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB49_2 +; GFX90A-NEXT: .LBB49_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB49_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB49_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1866,23 +1974,44 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB65_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB65_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB65_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB65_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -1892,23 +2021,44 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB66_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB66_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB66_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB66_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -1918,44 +2068,66 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: s_mov_b32 s4, s3 +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB67_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v2, s0 -; GFX90A-NEXT: ds_read_b64 v[0:1], v2 +; GFX90A-NEXT: v_mov_b32_e32 v4, s0 +; GFX90A-NEXT: ds_read_b64 v[2:3], v4 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0 -; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB67_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB67_2 +; GFX90A-NEXT: .LBB67_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: s_mov_b32 s4, s3 +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB67_3 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: ds_read_b64 v[0:1], v2 +; GFX940-NEXT: v_mov_b32_e32 v4, s0 +; GFX940-NEXT: ds_read_b64 v[2:3], v4 ; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB67_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_cbranch_execnz .LBB67_2 +; GFX940-NEXT: .LBB67_3: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir index e288d9d..eafd1e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir @@ -16,7 +16,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -40,7 +41,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p0), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32)) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p0) = COPY $vgpr0_vgpr1 %1:_(s32) = COPY $vgpr2 %2:_(s32) = COPY $vgpr3 @@ -63,7 +65,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 3) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ATOMIC_CMPXCHG]](s32) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s32) = COPY $vgpr2 @@ -87,7 +90,8 @@ body: | ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY1]](s64) ; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 1) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 %2:_(s64) = COPY $vgpr4_vgpr5 @@ -110,7 +114,8 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4 ; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 3) ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]] - ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[ATOMIC_CMPXCHG]](s64) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1) %0:_(p3) = COPY $vgpr0 %1:_(s64) = COPY $vgpr1_vgpr2 %2:_(s64) = COPY $vgpr3_vgpr4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir index e9f8180..fed277d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir @@ -64,9 +64,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[CTLZ_ZERO_UNDEF]], [[C]] - ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s16) = G_CTLZ_ZERO_UNDEF %0 %2:_(s32) = G_ZEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir index dba20e1..eb86a98 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir index 93d0071..80b3166 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir @@ -955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: saddsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: saddsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir index 57b1ab9..220450c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir @@ -86,8 +86,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32) ; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 @@ -117,8 +118,9 @@ body: | ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64) ; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -172,11 +174,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32) - ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>) + ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>) ; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s16>) = COPY $vgpr0 %1:_(<2 x s16>) = COPY $vgpr1 @@ -360,13 +363,14 @@ body: | ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1) ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1) + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]] ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>) ; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 %1:_(<4 x s16>) = COPY $vgpr1_vgpr2 @@ -403,11 +407,12 @@ body: | ; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]] ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]] ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>) ; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir index 33a8cda..49fb6e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir @@ -955,15 +955,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX8-LABEL: name: ssubsat_s64 @@ -980,15 +981,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) ; ; GFX9-LABEL: name: ssubsat_s64 @@ -1005,15 +1007,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = COPY $vgpr2_vgpr3 @@ -1043,15 +1046,16 @@ body: | ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1060,13 +1064,14 @@ body: | ; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1086,15 +1091,16 @@ body: | ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1103,13 +1109,14 @@ body: | ; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -1129,15 +1136,16 @@ body: | ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]] ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]] ; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]] + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 - ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32) + ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32) ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808 ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64) ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]] ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]] ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32) - ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]] + ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]] ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64) ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64) ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]] @@ -1146,13 +1154,14 @@ body: | ; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]] ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]] ; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]] - ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32) + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64) + ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32) ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64) ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64) ; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]] ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]] ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32) - ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]] + ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]] ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir index b4bc648..305eca7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir @@ -24,7 +24,7 @@ body: | bb.0: %0:_(s8) = G_CONSTANT i8 0 %1:_(p1) = G_CONSTANT i64 0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_TRAP bb.1: G_STORE %0, %1 :: (store 1, addrspace 1) @@ -55,7 +55,7 @@ body: | ; GCN-NEXT: S_ENDPGM 0 bb.0: %0:_(s8) = G_CONSTANT i8 0 - G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap) + G_TRAP %1:_(p1) = G_CONSTANT i64 0 bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll index 623360f..de46037 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -147,6 +147,34 @@ main_body: ret half %res } +define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_rtz_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GCN-LABEL: v_interp_f16_imm_params: ; GCN: ; %bb.0: ; %main_body @@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0 +declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index 6eed92b..6d4aa3b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -670,36 +670,19 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { ; GFX6-LABEL: simplify_demanded_bfe_sdiv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, -2 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_xor_b32 s0, s0, s2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001 +; GFX6-NEXT: s_ashr_i32 s4, s3, 31 +; GFX6-NEXT: s_lshr_b32 s4, s4, 31 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_ashr_i32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm %src = load i32, ptr addrspace(1) %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 686b849..06bd45a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX8-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 9edc2455..1e3f94a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index d36f5c0..a6f9bb7e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: saddsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_add_u32 s0, s2, s6 ; GFX10-NEXT: s_addc_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_add_u32 s0, s2, s6 ; GFX11-NEXT: s_addc_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s9, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 @@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s9, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 @@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s9, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 @@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, s4 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: v_mov_b32_e32 v2, s5 ; GFX11-NEXT: s_ashr_i32 s0, s9, 31 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX9-NEXT: v_bfrev_b32_e32 v6, 1 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc @@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo @@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo @@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs) %cast = bitcast i128 %result to <4 x float> @@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc @@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc @@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4 @@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17 -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 ; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0 @@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s17, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s17, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s17, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s10, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_add_u32 s0, s4, s12 @@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_mov_b32_e32 v2, s17 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s1, v4 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo @@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_and_b32 s1, 1, s1 ; GFX11-NEXT: s_ashr_i32 s10, s17, 31 ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000 +; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX11-NEXT: s_add_u32 s0, s4, s12 @@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-NEXT: v_readfirstlane_b32 s1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 1061f00..2c2f8e9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -279,125 +279,27 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 +; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i32 %num, 4096 ret i32 %result } define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { -; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; GISEL-NEXT: v_mov_b32_e32 v3, 0x1000 -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_mov_b32_e32 v5, 0xfffff000 -; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; GISEL-NEXT: v_lshlrev_b32_e32 v9, 12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc -; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; GISEL-NEXT: s_setpc_b64 s[30:31] -; -; CGP-LABEL: v_sdiv_v2i32_pow2k_denom: -; CGP: ; %bb.0: -; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v3 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: s_setpc_b64 s[30:31] +; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i32> %num, <i32 4096, i32 4096> ret <2 x i32> %result } @@ -884,3 +786,24 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) { %result = sdiv <2 x i32> %num.mask, %den.mask ret <2 x i32> %result } + +define i32 @v_sdiv_i32_exact(i32 %num) { +; CHECK-LABEL: v_sdiv_i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i32 %num, 4096 + ret i32 %result +} + +define <2 x i32> @v_sdiv_v2i32_exact(<2 x i32> %num) { +; CHECK-LABEL: v_sdiv_v2i32_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 0a6b7af..377fa24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -999,126 +999,11 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) { ; CHECK-LABEL: v_sdiv_i64_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 -; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; CHECK-NEXT: v_trunc_f32_e32 v4, v3 -; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 -; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 4096 ret i64 %result @@ -1128,473 +1013,31 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8 -; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GISEL-NEXT: v_mov_b32_e32 v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10] -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9] -; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000 -; GISEL-NEXT: s_subb_u32 s7, 0, 0 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc -; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0 -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9] -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0 -; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2] -; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4 -; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc -; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6 -; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v7, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6 -; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v5 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GISEL-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000 -; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0 -; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000 -; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; CGP-NEXT: v_trunc_f32_e32 v7, v5 -; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0 -; CGP-NEXT: v_mov_b32_e32 v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8] -; CGP-NEXT: v_mul_hi_u32 v12, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11] -; CGP-NEXT: v_mul_lo_u32 v10, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v11, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v8, v13 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v8, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v9, v13 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc -; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_xor_b32_e32 v15, v0, v7 -; CGP-NEXT: v_mul_lo_u32 v0, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v14 -; CGP-NEXT: v_xor_b32_e32 v18, v1, v7 -; CGP-NEXT: v_mul_hi_u32 v1, v16, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v13 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v1, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; CGP-NEXT: v_mul_hi_u32 v4, v16, v14 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v18, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 -; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v18, v0 -; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v18, v1 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v17, v18, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 -; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15 -; CGP-NEXT: v_mul_lo_u32 v19, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_mul_hi_u32 v10, v8, v0 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v9, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v8, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v8, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v9, v0 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v9, v5 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v9, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 -; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc -; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; CGP-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CGP-NEXT: v_ashr_i64 v[2:3], v[2:3], 12 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -3091,253 +2534,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6] -; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7 +; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2 -; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc -; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 -; GISEL-NEXT: v_trunc_f32_e32 v6, v6 -; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2 +; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 +; GISEL-NEXT: v_trunc_f32_e32 v5, v5 +; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 ; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7] -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7] -; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3 -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0 +; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6] +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc +; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17 -; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2] -; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13 +; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2] +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5 -; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4 +; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0 +; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2 -; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -3399,3 +2841,24 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { %result = sdiv <2 x i64> %num.mask, %den.mask ret <2 x i64> %result } + +define i64 @v_sdiv_i64_exact(i64 %num) { +; CHECK-LABEL: v_sdiv_i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact i64 %num, 4096 + ret i64 %result +} + +define <2 x i64> @v_sdiv_v2i64_exact(<2 x i64> %num) { +; CHECK-LABEL: v_sdiv_v2i64_exact: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; CHECK-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index c455b24..83ebc84 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1 -; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1 -; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1 +; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5 +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mov_b32_e32 v5, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9] +; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 +; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0 -; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4 -; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 -; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1] ; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9] +; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0 -; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5] ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1 +; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc ; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v7, v5 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0 -; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0 +; GISEL-NEXT: v_trunc_f32_e32 v6, v4 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0 +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5 -; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5 +; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4 +; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19 -; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0 -; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19 +; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19 -; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0 -; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1] -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5 -; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12 -; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5 -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1] +; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1] +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4 +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6 -; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6 -; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 -; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4 +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 +; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7 -; GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1] +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6 +; GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1] ; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7] -; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc -; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6] +; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4 -; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2 -; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 +; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4 -; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] -; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 61e1e67..320dfbb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX6-NEXT: s_ashr_i32 s2, s7, 31 ; GFX6-NEXT: s_ashr_i32 s5, s7, 15 -; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX6-NEXT: s_addk_i32 s2, 0x8000 ; GFX6-NEXT: v_mov_b32_e32 v0, s5 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX8-NEXT: s_ashr_i32 s2, s7, 31 ; GFX8-NEXT: s_ashr_i32 s5, s7, 15 -; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX9-LABEL: ssubsat_i48_vs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1] @@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo @@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX6-NEXT: s_ashr_i32 s2, s5, 31 -; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 @@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX8-NEXT: s_ashr_i32 s2, s5, 31 -; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 @@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 -; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 @@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: s_ashr_i32 s2, s5, 31 -; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX10-NEXT: s_xor_b32 s0, s1, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0 ; GFX11-NEXT: s_ashr_i32 s2, s5, 31 -; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000 ; GFX11-NEXT: s_xor_b32 s0, s1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 @@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1] +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0 -; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog @@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0 ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-NEXT: ; return to shader part epilog @@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5] ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2 -; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5] +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 +; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1] ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5] ; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] -; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12 ; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3] -; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7] +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo @@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11 ; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3] ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7] -; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo ; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1 @@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 @@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 @@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 @@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 @@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 @@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 @@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX10-NEXT: s_ashr_i32 s4, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s8, s1, s0 ; GFX10-NEXT: s_sub_u32 s0, s2, s6 ; GFX10-NEXT: s_subb_u32 s1, s3, s7 @@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX10-NEXT: s_ashr_i32 s4, s1, 31 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: s_xor_b32 s1, s3, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1] ; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0 ; GFX11-NEXT: s_ashr_i32 s4, s9, 31 -; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s8, s1, s0 ; GFX11-NEXT: s_sub_u32 s0, s2, s6 ; GFX11-NEXT: s_subb_u32 s1, s3, s7 @@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: s_xor_b32 s1, s3, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1 @@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s11, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 @@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s11, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 @@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s11, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 @@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, s9 ; GFX10-NEXT: v_mov_b32_e32 v3, s11 @@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 ; GFX11-NEXT: v_mov_b32_e32 v3, s11 ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 @@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { ; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2 ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 @@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { ; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0 ; GFX11-NEXT: s_and_b32 s0, 1, s4 -; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo ; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 @@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19 -; GFX9-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc @@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4 ; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6 ; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc @@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19 -; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo -; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v5, 1, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19 +; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4 @@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { ; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15] -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19 +; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6 +; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3 -; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0 @@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s0, s19, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 @@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 @@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s0, s19, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 @@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_ashr_i32 s4, s3, 31 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 @@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s0, s19, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000 +; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 @@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000 +; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 @@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s1, 1, 0 ; GFX10-NEXT: s_ashr_i32 s8, s17, 31 ; GFX10-NEXT: s_and_b32 s1, 1, s1 -; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 ; GFX10-NEXT: s_ashr_i32 s4, s3, 31 ; GFX10-NEXT: s_and_b32 s5, 1, s5 -; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 @@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: s_cselect_b32 s1, 1, 0 ; GFX11-NEXT: s_ashr_i32 s8, s19, 31 ; GFX11-NEXT: s_and_b32 s1, 1, s1 -; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000 +; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000 ; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 @@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000 +; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, s18 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index 887c43f..d155513 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 5c6bb6d..07480a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 @@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 diff --git a/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir new file mode 100644 index 0000000..cba114c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir @@ -0,0 +1,68 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: reg_ops +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: reg_ops + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub0 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY2]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], [[COPY3]], implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: reg_ops + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], [[DEF1]] + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = IMPLICIT_DEF + %2:sreg_64 = S_ADD_U64_PSEUDO %0, %1, implicit-def $scc +... + +--- +name: lhs_imm +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: lhs_imm + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 6565, [[COPY]], implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 0, [[COPY1]], implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: lhs_imm + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 6565, [[DEF]] + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = S_ADD_U64_PSEUDO 6565, %0, implicit-def $scc +... + +--- +name: rhs_imm +tracksRegLiveness: true +body: | + bb.0: + ; GFX11-LABEL: name: rhs_imm + ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1 + ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 6565, implicit-def $scc + ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], 0, implicit-def $scc, implicit $scc + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1 + ; + ; GFX12-LABEL: name: rhs_imm + ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], 6565 + %0:sreg_64 = IMPLICIT_DEF + %1:sreg_64 = S_ADD_U64_PSEUDO %0, 6565, implicit-def $scc +... diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 66034af..cff9ce0 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -233,9 +233,9 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/allow-check.ll b/llvm/test/CodeGen/AMDGPU/allow-check.ll new file mode 100644 index 0000000..d4f5621 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/allow-check.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=0 | FileCheck %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck %s +; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=1 | FileCheck %s + +define i1 @test_runtime() local_unnamed_addr { +; CHECK-LABEL: test_runtime: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %allow = call i1 @llvm.allow.runtime.check(metadata !"test_check") + ret i1 %allow +} + +declare i1 @llvm.allow.runtime.check(metadata) nounwind + +define i1 @test_ubsan() local_unnamed_addr { +; CHECK-LABEL: test_ubsan: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %allow = call i1 @llvm.allow.ubsan.check(i8 7) + ret i1 %allow +} + +declare i1 @llvm.allow.ubsan.check(i8) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll new file mode 100644 index 0000000..33b1cc6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +define amdgpu_kernel void @kernel_uses_asm_virtreg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_def() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DEF:%.*]] = call i32 asm sideeffect " +; CHECK-NEXT: ret void +; + %def = call i32 asm sideeffect "; def $0", "=a"() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect " +; CHECK-NEXT: ret void +; + %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"() + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_non_agpr_asm() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "v"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a0}"(i32 poison) + ret void +} + +define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + ret void +} + +define void @func_uses_asm_virtreg_agpr() { +; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "a"(i32 poison) + ret void +} + +define void @func_uses_asm_physreg_agpr() { +; CHECK-LABEL: define void @func_uses_asm_physreg_agpr( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a0}"(i32 poison) + ret void +} + +define void @func_uses_asm_physreg_agpr_tuple() { +; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: call void asm sideeffect " +; CHECK-NEXT: ret void +; + call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison) + ret void +} + +declare void @unknown() + +define amdgpu_kernel void @kernel_calls_extern() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern( +; CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: ret void +; + call void @unknown() + ret void +} + +define amdgpu_kernel void @kernel_calls_extern_marked_callsite() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite( +; CHECK-SAME: ) #[[ATTR4]] { +; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]] +; CHECK-NEXT: ret void +; + call void @unknown() #0 + ret void +} + +define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect( +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: call void [[INDIRECT]]() +; CHECK-NEXT: ret void +; + call void %indirect() + ret void +} + +define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite( +; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] { +; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]] +; CHECK-NEXT: ret void +; + call void %indirect() #0 + ret void +} + +define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: ret void +; + call void @func_uses_asm_physreg_agpr() + ret void +} + +define void @empty() { +; CHECK-LABEL: define void @empty( +; CHECK-SAME: ) #[[ATTR5:[0-9]+]] { +; CHECK-NEXT: ret void +; + ret void +} + +define void @also_empty() { +; CHECK-LABEL: define void @also_empty( +; CHECK-SAME: ) #[[ATTR5]] { +; CHECK-NEXT: ret void +; + ret void +} + +define amdgpu_kernel void @kernel_calls_empty() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @empty() +; CHECK-NEXT: ret void +; + call void @empty() + ret void +} + +define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @empty() +; CHECK-NEXT: call void @func_uses_asm_physreg_agpr() +; CHECK-NEXT: ret void +; + call void @empty() + call void @func_uses_asm_physreg_agpr() + ret void +} + +define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic( +; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false) +; CHECK-NEXT: ret void +; + call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false) + ret void +} + +declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg) + +define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0) +; CHECK-NEXT: store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128 +; CHECK-NEXT: ret void +; + %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0) + store <32 x float> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) { +; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x( +; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4 +; CHECK-NEXT: ret void +; + %result = call i32 @llvm.amdgcn.workitem.id.x() + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) { +; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr( +; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty +; CHECK-NEXT: call void [[FPTR]]() +; CHECK-NEXT: ret void +; + %fptr = select i1 %cond, ptr @empty, ptr @also_empty + call void %fptr() + ret void +} + + +attributes #0 = { "amdgpu-no-agpr" } +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll index 192bf7c..93b9aea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll @@ -1197,3 +1197,54 @@ reallyfinally: store <5 x double> %val, ptr %out, align 1 ret void } + +define amdgpu_kernel void @pr85718(i1 %Bool, ptr %Ptr, <4 x float> %Vec1, <4 x float> %Vec2) { +; OPT-LABEL: @pr85718( +; OPT-NEXT: BB0: +; OPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true +; OPT-NEXT: br label [[BB1:%.*]] +; OPT: BB1: +; OPT-NEXT: [[TMP0:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE0:%.*]], [[BB2:%.*]] ], [ [[LARGEPHI_EXTRACTSLICE1:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0:%.*]] ] +; OPT-NEXT: [[TMP1:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE3:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE4:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[TMP2:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE6:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE7:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[TMP3:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE9:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE10:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ] +; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE0]], float [[TMP1]], i64 1 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE1]], float [[TMP2]], i64 2 +; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE2]], float [[TMP3]], i64 3 +; OPT-NEXT: store <4 x float> [[LARGEPHI_INSERTSLICE3]], ptr [[PTR:%.*]], align 128 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1]] = extractelement <4 x float> [[VEC2:%.*]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4]] = extractelement <4 x float> [[VEC2]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7]] = extractelement <4 x float> [[VEC2]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10]] = extractelement <4 x float> [[VEC2]], i64 3 +; OPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]] +; OPT: BB2: +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0]] = extractelement <4 x float> [[I]], i64 0 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3]] = extractelement <4 x float> [[I]], i64 1 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6]] = extractelement <4 x float> [[I]], i64 2 +; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9]] = extractelement <4 x float> [[I]], i64 3 +; OPT-NEXT: br label [[BB1]] +; +; NOOPT-LABEL: @pr85718( +; NOOPT-NEXT: BB0: +; NOOPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true +; NOOPT-NEXT: br label [[BB1:%.*]] +; NOOPT: BB1: +; NOOPT-NEXT: [[PHI:%.*]] = phi <4 x float> [ [[I]], [[BB2:%.*]] ], [ [[VEC2:%.*]], [[BB1]] ], [ zeroinitializer, [[BB0:%.*]] ] +; NOOPT-NEXT: store <4 x float> [[PHI]], ptr [[PTR:%.*]], align 128 +; NOOPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]] +; NOOPT: BB2: +; NOOPT-NEXT: br label [[BB1]] +; +BB0: + %I = insertelement <4 x float> %Vec1, float 4.200000e+01, i1 true + br label %BB1 + +BB1: ; preds = %BB0, %BB1, %BB2 + %PHI = phi <4 x float> [ %I, %BB2 ], [ %Vec2, %BB1 ], [ zeroinitializer, %BB0 ] + store <4 x float> %PHI, ptr %Ptr, align 128 + br i1 %Bool, label %BB1, label %BB2 + +BB2: ; preds = %BB1 + br label %BB1 +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index d900165..2ad28b8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -10668,3 +10668,111 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x store <2 x i64> %r, ptr addrspace(1) %out ret void } + +define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) { +; CHECK-LABEL: @v_sdiv_i32_exact( +; CHECK: %1 = extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_sdiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_sdiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = sdiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = sdiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_sdiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_sdiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} + +define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) { +; CHECK-LABEL: @v_udiv_i32_exact( +; CHECK: %1 = extractelement <2 x i32> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i32 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i32 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1 +; CHECK-NEXT: ret <2 x i32> %6 +; +; GFX6-LABEL: v_udiv_i32_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i32_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024> + ret <2 x i32> %result +} + +define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) { +; CHECK-LABEL: @v_udiv_i64_exact( +; CHECK: %1 = extractelement <2 x i64> %num, i64 0 +; CHECK-NEXT: %2 = udiv exact i64 %1, 4096 +; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0 +; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1 +; CHECK-NEXT: %5 = udiv exact i64 %4, 1024 +; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1 +; CHECK-NEXT: ret <2 x i64> %6 +; +; GFX6-LABEL: v_udiv_i64_exact: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_udiv_i64_exact: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024> + ret <2 x i64> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll index 942f459..8ddaf24 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll @@ -808,7 +808,7 @@ define float @test_pown_fast_f32_nobuiltin(float %x, i32 %y) { ; CHECK-LABEL: define float @test_pown_fast_f32_nobuiltin ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: ret float [[CALL]] ; entry: @@ -820,11 +820,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 { ; CHECK-LABEL: define float @test_pown_fast_f32_strictfp ; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) -; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) -; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float -; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] -; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) +; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]] +; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]] +; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]] +; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]] ; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32 ; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index 2ffa647..2e64a34 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -896,7 +896,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 { ; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp( ; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) +; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]] ; CHECK-NEXT: ret float [[__ROOTN2RSQRT]] ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index af0eb23..3d4ae84d9 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -1025,33 +1025,33 @@ attributes #6 = { "enqueued-block" } ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind } ; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 9a9c28a..43cdf85 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -643,19 +643,19 @@ attributes #1 = { nounwind } ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" } ;. ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500} ;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index 6c5e58c..547ff69 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -393,17 +393,18 @@ define amdgpu_kernel void @use_get_local_size_z(ptr addrspace(1) %ptr) #1 { attributes #0 = { nounwind readnone } attributes #1 = { nounwind } +;. ; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind } ;. ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } -; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } +; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 1ebd864..2970495 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -477,7 +477,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: s_cbranch_execz .LBB1_3 ; GFX1032-NEXT: ; %bb.2: ; GFX1032-NEXT: v_mov_b32_e32 v0, s11 -; GFX1032-NEXT: s_mov_b32 s10, s11 ; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1032-NEXT: .LBB1_3: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 @@ -615,7 +614,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: s_cbranch_execz .LBB1_3 ; GFX1132-NEXT: ; %bb.2: ; GFX1132-NEXT: v_mov_b32_e32 v0, s11 -; GFX1132-NEXT: s_mov_b32 s10, s11 ; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc ; GFX1132-NEXT: .LBB1_3: ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9 diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir index c1da29e..3228962 100644 --- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir +++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir @@ -14,6 +14,8 @@ --- name: test_av_spill_cross_bb_usage tracksRegLiveness: true +frameInfo: + adjustsStack: true stack: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } machineFunctionInfo: diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll new file mode 100644 index 0000000..7108f3d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -0,0 +1,357 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s + +; TODO: Add global-isel when it can support bf16 + +define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) { +; GCN-LABEL: v_test_cvt_bf16_f32_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GCN-NEXT: ; return to shader part epilog + %cvt = fpext bfloat %v to float + ret float %cvt +} + +define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) { +; GCN-LABEL: v_test_cvt_bf16_f32_s: +; GCN: ; %bb.0: +; GCN-NEXT: s_lshl_b32 s0, s0, 16 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %cvt = fpext bfloat %v to float + ret float %cvt +} + +define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) { +; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v2, v2, v0, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GCN-NEXT: v_add3_u32 v2, v2, v1, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x float> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} + +define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) { +; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: s_bfe_u32 s2, s1, 0x10010 +; GCN-NEXT: s_add_i32 s2, s2, s1 +; GCN-NEXT: s_or_b32 s4, s1, 0x400000 +; GCN-NEXT: s_add_i32 s5, s2, 0x7fff +; GCN-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, s4, s5 +; GCN-NEXT: s_bfe_u32 s1, s0, 0x10010 +; GCN-NEXT: s_add_i32 s1, s1, s0 +; GCN-NEXT: s_or_b32 s3, s0, 0x400000 +; GCN-NEXT: s_add_i32 s4, s1, 0x7fff +; GCN-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s3, s4 +; GCN-NEXT: s_pack_hh_b32_b16 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x float> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} + +define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) { +; GCN-LABEL: v_test_cvt_f32_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: ; return to shader part epilog + %trunc = fptrunc float %src to bfloat + %ext = fpext bfloat %trunc to float + ret float %ext +} + +define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { +; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v6 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v6, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: v_and_or_b32 v5, v1, s4, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s5, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s5 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc +; GCN-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 +; GCN-NEXT: v_and_b32_e32 v6, 1, v5 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v0, v5, v0 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GCN-NEXT: v_and_or_b32 v1, v3, s4, v0 +; GCN-NEXT: v_bfe_u32 v0, v0, 16, 1 +; GCN-NEXT: v_add3_u32 v0, v0, v1, s5 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_perm_b32 v0, v0, v4, s0 +; GCN-NEXT: ; return to shader part epilog + %res = fptrunc <2 x double> %src to <2 x bfloat> + %cast = bitcast <2 x bfloat> %res to float + ret float %cast +} + +define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) { +; GCN-LABEL: fptrunc_f32_f32_to_v2bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v2, v2, v0, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GCN-NEXT: v_add3_u32 v2, v2, v1, s0 +; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %a.cvt = fptrunc float %a to bfloat + %b.cvt = fptrunc float %b to bfloat + %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0 + %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1 + %ret = bitcast <2 x bfloat> %v2.2 to float + ret float %ret +} + +define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) { +; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0 +; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v3, v3, v2, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1 +; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GCN-NEXT: v_add3_u32 v3, v3, v2, s0 +; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v1|, |v1| +; GCN-NEXT: s_mov_b32 s0, 0x7060302 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GCN-NEXT: v_perm_b32 v0, v1, v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %a.neg = fneg float %a + %a.cvt = fptrunc float %a.neg to bfloat + %b.abs = call float @llvm.fabs.f32(float %b) + %b.cvt = fptrunc float %b.abs to bfloat + %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0 + %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1 + %ret = bitcast <2 x bfloat> %v2.2 to float + ret float %ret +} + +define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v1, v1, v0, s0 +; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.cvt = fptrunc float %a to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16_abs: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0 +; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0| +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.abs = call float @llvm.fabs.f32(float %a) + %a.cvt = fptrunc float %a.abs to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { +; GCN-LABEL: fptrunc_f32_to_bf16_neg: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v3, v2 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0 +; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v1, s0 +; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.neg = fneg float %a + %a.cvt = fptrunc float %a.neg to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GCN-NEXT: v_and_b32_e32 v7, 1, v6 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v6, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: s_brev_b32 s0, 1 +; GCN-NEXT: v_and_or_b32 v5, v1, s0, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.cvt = fptrunc double %a to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16_neg: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GCN-NEXT: v_and_b32_e32 v8, 1, v7 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v7, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.neg = fneg double %a + %a.cvt = fptrunc double %a.neg to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { +; GCN-LABEL: fptrunc_f64_to_bf16_abs: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]| +; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GCN-NEXT: v_and_b32_e32 v8, 1, v7 +; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] +; GCN-NEXT: v_add_u32_e32 v4, v7, v4 +; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc +; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1 +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-NEXT: s_brev_b32 s0, 1 +; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1 +; GCN-NEXT: s_movk_i32 s0, 0x7fff +; GCN-NEXT: v_add3_u32 v4, v4, v5, s0 +; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1 +; GCN-NEXT: s_endpgm +entry: + %a.abs = call double @llvm.fabs.f64(double %a) + %a.cvt = fptrunc double %a.abs to bfloat + store bfloat %a.cvt, ptr %out + ret void +} + +declare float @llvm.fabs.f32(float) +declare double @llvm.fabs.f64(double) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ebb77c1..9865883 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; @@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v1, v1, v3 ; GCN-NEXT: v_min_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v2, v2, v5 ; GCN-NEXT: v_min_f32_e32 v1, v1, v4 ; GCN-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 @@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v7 ; GCN-NEXT: v_min_f32_e32 v2, v2, v6 ; GCN-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 @@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v7, v7, v15 ; GCN-NEXT: v_min_f32_e32 v6, v6, v14 ; GCN-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v13 @@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_min_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_min_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_min_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_min_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_min_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_min_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_min_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_min_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_min_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_min_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_min_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_min_f32_e32 v3, v3, v19 ; GCN-NEXT: v_min_f32_e32 v2, v2, v18 ; GCN-NEXT: v_min_f32_e32 v1, v1, v17 ; GCN-NEXT: v_min_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_min_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_minnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_min_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 @@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_min_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_min_f32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_min_f32_e32 v30, v30, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_min_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_min_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_min_f32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_min_f32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_min_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_min_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_min_f32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_min_f32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_min_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_min_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_min_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_min_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_min_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_min_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_min_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_min_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_min_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_min_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_min_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_min_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_min_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_min_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_min_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_min_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_min_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_min_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_min_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_min_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_min_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_min_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_min_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_min_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_min_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v0, v0, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v1, v1, v3 ; GCN-NEXT: v_max_f32_e32 v0, v0, v2 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 @@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v2, v2, v5 ; GCN-NEXT: v_max_f32_e32 v1, v1, v4 ; GCN-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 @@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v7 ; GCN-NEXT: v_max_f32_e32 v2, v2, v6 ; GCN-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 @@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v7, v7, v15 ; GCN-NEXT: v_max_f32_e32 v6, v6, v14 ; GCN-NEXT: v_max_f32_e32 v5, v5, v13 @@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v15 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v14 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v13 @@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_max_f32_e32 v14, v14, v30 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_max_f32_e32 v13, v13, v29 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_max_f32_e32 v12, v12, v28 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_max_f32_e32 v11, v11, v27 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_max_f32_e32 v10, v10, v26 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_max_f32_e32 v9, v9, v25 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_max_f32_e32 v8, v8, v24 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_max_f32_e32 v7, v7, v23 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_max_f32_e32 v6, v6, v22 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_max_f32_e32 v5, v5, v21 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 @@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_max_f32_e32 v4, v4, v20 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_max_f32_e32 v3, v3, v19 ; GCN-NEXT: v_max_f32_e32 v2, v2, v18 ; GCN-NEXT: v_max_f32_e32 v1, v1, v17 ; GCN-NEXT: v_max_f32_e32 v0, v0, v16 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 @@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_max_f32_e32 v15, v15, v16 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 @@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-LABEL: v_maxnum_v16bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 @@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 @@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 @@ -22396,48 +22026,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v30 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v29 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX7-NEXT: v_max_f32_e32 v15, v15, v25 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 @@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_max_f32_e32 v15, v15, v22 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 @@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_max_f32_e32 v31, v31, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_max_f32_e32 v30, v30, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_max_f32_e32 v29, v29, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_max_f32_e32 v28, v28, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_max_f32_e32 v27, v27, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_max_f32_e32 v26, v26, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_max_f32_e32 v25, v25, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_max_f32_e32 v24, v24, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_max_f32_e32 v23, v23, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_max_f32_e32 v22, v22, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_max_f32_e32 v21, v21, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_max_f32_e32 v20, v20, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_max_f32_e32 v19, v19, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_max_f32_e32 v18, v18, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_max_f32_e32 v17, v17, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_max_f32_e32 v16, v16, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_max_f32_e32 v15, v15, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_max_f32_e32 v14, v14, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_max_f32_e32 v13, v13, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_max_f32_e32 v12, v12, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_max_f32_e32 v11, v11, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_max_f32_e32 v10, v10, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_max_f32_e32 v9, v9, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_max_f32_e32 v8, v8, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_max_f32_e32 v7, v7, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_max_f32_e32 v6, v6, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_max_f32_e32 v5, v5, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_max_f32_e32 v4, v4, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_max_f32_e32 v3, v3, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_max_f32_e32 v2, v2, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_max_f32_e32 v1, v1, v32 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33 ; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GCN-NEXT: v_max_f32_e32 v0, v0, v32 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 @@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 ; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 ; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 ; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 ; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 ; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 ; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 ; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 ; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 ; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 ; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 ; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 ; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 ; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 ; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 ; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 ; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 ; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 ; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 ; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 ; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 -; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_max_f32_e32 v31, v31, v32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; GFX7-NEXT: v_max_f32_e32 v30, v30, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 ; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v29, v29, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 ; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v28, v28, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 ; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v27, v27, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 ; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v26, v26, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 ; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v25, v25, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 ; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v24, v24, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 ; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v23, v23, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 ; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v22, v22, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v21, v21, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 ; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v20, v20, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 ; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v19, v19, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v18, v18, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 ; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v17, v17, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v16, v16, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v15, v15, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v14, v14, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v13, v13, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v12, v12, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 ; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v11, v11, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 ; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v10, v10, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v9, v9, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v8, v8, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v3, v3, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v2, v2, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v1, v1, v32 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 -; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v32 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0 ; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GCN-LABEL: v_canonicalize_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_canonicalize_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_canonicalize_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index dfadd8d..9472845 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp +; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; @@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 -; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 +; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0 +; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 +; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 @@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out ; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000 ; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2 -; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3 -; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4 -; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2 +; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll index 2ed6d7f..1c8725f 100644 --- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll +++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll @@ -1,10 +1,12 @@ ; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s ; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s +; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL ; CHECK-LABEL: name: basic_call -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY -; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}} +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] ; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]] define i32 @basic_call(i32 %src) #0 { %t = call token @llvm.experimental.convergence.entry() %r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ] @@ -12,10 +14,11 @@ define i32 @basic_call(i32 %src) #0 { } ; CHECK-LABEL: name: basic_intrinsic -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR ; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]] define i32 @basic_intrinsic(i32 %src) #0 { %t = call token @llvm.experimental.convergence.anchor() %r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ] @@ -30,12 +33,13 @@ define i32 @uncontrolled_call(i32 %src) #0 { } ; CHECK-LABEL: name: basic_branch -; CHECK: bb.0.entry: -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.then: +; CHECK: bb.[[#]].entry: +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR +; CHECK: bb.[[#]].then: ; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]] define i32 @basic_branch(i32 %src, i1 %cond) #0 { entry: %t = call token @llvm.experimental.convergence.anchor() @@ -52,12 +56,13 @@ else: } ; CHECK-LABEL: name: basic_loop -; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR -; CHECK: bb.1.loop: -; CHECK: [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]] +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR +; CHECK: bb.[[#]].loop: +; CHECK: [[LOOP:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_LOOP [[TOKEN]] ; ISEL: CONVERGENCECTRL_GLUE [[LOOP]] ; DEADMI-NOT: CONVERGENCECTRL_GLUE -; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[LOOP]] define i32 @basic_loop(i32 %src, i1 %cond) #0 { %t1 = call token @llvm.experimental.convergence.anchor() br label %loop @@ -71,6 +76,32 @@ end: ret i32 %r } +; CHECK-LABEL: name: nested +; CHECK: [[ENTRY:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; CHECK: [[ANCHOR:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ANCHOR]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ANCHOR]] +; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ENTRY]] +; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ENTRY]] +define i32 @nested(i32 %src) #0 { + %t1 = call token @llvm.experimental.convergence.entry() + %t2 = call token @llvm.experimental.convergence.anchor() + %r2 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ] + %r1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t1) ] + %sum = add i32 %r1, %r2 + ret i32 %sum +} + +; CHECK-LABEL: name: tail_call_void_func_void +; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY +; CHECK: {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, {{.*}}implicit [[TOKEN]] +define void @tail_call_void_func_void() #0 { + %t1 = call token @llvm.experimental.convergence.entry() + tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ] + ret void +} + +declare hidden void @external_void_func_void() #0 declare i32 @foo(i32 %x) #0 declare i32 @llvm.amdgcn.readfirstlane(i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir index 895185c..577d38e 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir @@ -333,7 +333,7 @@ ret void } - attributes #0 = { "amdgpu-waves-per-eu"="4,4" } + attributes #0 = { "amdgpu-waves-per-eu"="4,4" "amdgpu-no-agpr" } ... --- diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll index 0c03419..386f9cd 100644 --- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll @@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 2f3d5d9..cf99b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s -; FIXME: GlobalISel missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671 -; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s -; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s +; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_sdiv_i128_vv: @@ -1223,6 +1222,1158 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_sdiv_i128_vv: +; GFX9-G: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7 +; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v17, v6 +; GFX9-G-NEXT: v_subb_co_u32_e32 v19, vcc, v1, v17, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v17, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v17, vcc +; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4 +; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18 +; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19 +; GFX9-G-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v4 +; GFX9-G-NEXT: v_min_u32_e32 v0, v0, v1 +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v5 +; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5] +; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0 +; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10 +; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7] +; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11 +; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12 +; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2 +; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13 +; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13] +; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1 +; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3 +; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7] +; GFX9-G-NEXT: v_sub_co_u32_e64 v0, s[6:7], v0, v1 +; GFX9-G-NEXT: v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v6, 0x7f +; GFX9-G-NEXT: v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7] +; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3 +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7] +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6 +; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0 +; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2 +; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14 +; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_6 +; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0 +; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] +; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 +; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] +; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20 +; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v2, s10 +; GFX9-G-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2 +; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc +; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12 +; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18 +; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12 +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19 +; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4 +; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc +; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5 +; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20 +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22 +; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10 +; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28 +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: .LBB0_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4 +; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3 +; GFX9-G-NEXT: .LBB0_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16 +; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3 +; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v1 +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v12, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v1, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v3, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s6, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v10, v2 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v5 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v14 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v9, v6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[6:7], v5, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s16, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s15, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s16 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-G-O0-NEXT: s_branch .LBB0_8 +; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_5 +; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_9 +; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_3 +; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_4 +; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6 +; GFX9-G-O0-NEXT: s_branch .LBB0_1 +; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB0_6 +; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5 +; GFX9-G-O0-NEXT: s_branch .LBB0_7 +; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v8 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v10 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v6 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v5 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v8 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v7, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v6, s[4:5] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = sdiv i128 %lhs, %rhs ret i128 %div } @@ -2306,6 +3457,1043 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_udiv_i128_vv: +; GFX9-G: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-G-NEXT: v_or_b32_e32 v9, v5, v7 +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-G-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v4 +; GFX9-G-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX9-G-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v6 +; GFX9-G-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v7 +; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] +; GFX9-G-NEXT: v_add_u32_e32 v8, 64, v8 +; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v1 +; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v11, v2 +; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10 +; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v3 +; GFX9-G-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] +; GFX9-G-NEXT: v_add_u32_e32 v9, 64, v9 +; GFX9-G-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[6:7] +; GFX9-G-NEXT: v_sub_co_u32_e64 v12, s[6:7], v8, v9 +; GFX9-G-NEXT: v_subb_co_u32_e64 v13, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v8, 0x7f +; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7] +; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9] +; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[14:15] +; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15 +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7] +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7] +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8 +; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12 +; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14 +; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16 +; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GFX9-G-NEXT: s_cbranch_execz .LBB1_6 +; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1 +; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 +; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc +; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GFX9-G-NEXT: v_sub_co_u32_e32 v16, vcc, 0x7f, v12 +; GFX9-G-NEXT: v_sub_u32_e32 v8, 64, v16 +; GFX9-G-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3] +; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1] +; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10 +; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1] +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16 +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader +; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 +; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 +; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1] +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b64 v[2:3], v22, v[2:3] +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13 +; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 +; GFX9-G-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc +; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9] +; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 +; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v13, s11 +; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5] +; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-NEXT: v_mov_b32_e32 v11, s9 +; GFX9-G-NEXT: v_mov_b32_e32 v10, s8 +; GFX9-G-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while +; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12 +; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13 +; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17] +; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9 +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0 +; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2 +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc +; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc +; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4 +; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5 +; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10 +; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5] +; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6 +; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7 +; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc +; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20 +; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21 +; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc +; GFX9-G-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc +; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX9-G-NEXT: v_mov_b32_e32 v10, v0 +; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3 +; GFX9-G-NEXT: ; %bb.4: ; %Flow +; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-G-NEXT: .LBB1_5: ; %Flow2 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15] +; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15 +; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2 +; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0 +; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1 +; GFX9-G-NEXT: .LBB1_6: ; %Flow3 +; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-G-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-G-NEXT: v_mov_b32_e32 v1, v11 +; GFX9-G-NEXT: v_mov_b32_e32 v2, v8 +; GFX9-G-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_vv: +; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4 +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12 +; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11 +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12] +; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s14, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10 +; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6 +; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32 +; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9 +; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s13, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s11, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s12, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6 +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9] +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13 +; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7 +; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9 +; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1 +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-G-O0-NEXT: s_branch .LBB1_8 +; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_5 +; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_9 +; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9 +; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_3 +; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4 +; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5 +; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_4 +; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while +; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6 +; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4] +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s9, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13] +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec +; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22 +; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15 +; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9] +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4 +; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24 +; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11 +; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8 +; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11 +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9] +; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s8, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s12, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s11, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19 +; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20 +; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19 +; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4 +; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20] +; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12 +; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3 +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6 +; GFX9-G-O0-NEXT: s_branch .LBB1_1 +; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b32 s4, 64 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25 +; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23 +; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13 +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] +; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20 +; GFX9-G-O0-NEXT: s_mov_b32 s4, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s7, -1 +; GFX9-G-O0-NEXT: s_mov_b32 s6, -1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17 +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5] +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9] +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6 +; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4 +; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_branch .LBB1_6 +; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-G-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s9, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3) +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5 +; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10 +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7 +; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14] +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14] +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15 +; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9] +; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12 +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7] +; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8 +; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7 +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4 +; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8] +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec +; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4 +; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-G-O0-NEXT: s_branch .LBB1_7 +; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end +; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8 +; GFX9-G-O0-NEXT: ; kill: killed $vgpr4 +; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_nop 0 +; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, %rhs ret i128 %div } @@ -2388,6 +4576,66 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX9-G-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-G-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5] +; GFX9-G-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX9-G-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_sdiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v0, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-G-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7] +; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v2, v2, v4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = sdiv i128 %lhs, 8589934592 ret i128 %div } @@ -2434,10 +4682,42 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G: ; %bb.0: +; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3] +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4 +; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3 +; GFX9-G-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-G-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-G-O0-LABEL: v_udiv_i128_v_pow2k: +; GFX9-G-O0: ; %bb.0: +; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 31 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5] +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6 +; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4 +; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 1 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3 +; GFX9-G-O0-NEXT: s_mov_b32 s4, 0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4 +; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31] %div = udiv i128 %lhs, 8589934592 ret i128 %div } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX9-SDAG: {{.*}} -; GFX9-SDAG-O0: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 46e2632..16a03ba 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -1,25 +1,3248 @@ -; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s - -; SDAG-ERR: LLVM ERROR: unsupported libcall legalization -; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_sdiv_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v26, v24 +; SDAG-NEXT: v_mov_b32_e32 v27, v25 +; SDAG-NEXT: v_xor_b32_e32 v17, v24, v3 +; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1 +; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0 +; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11 +; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10 +; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9 +; SDAG-NEXT: v_xor_b32_e32 v8, v25, v8 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v3 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v17, v10 +; SDAG-NEXT: v_min_u32_e32 v18, v1, v18 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v11 +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17 +; SDAG-NEXT: v_ffbh_u32_e32 v17, v11 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 +; SDAG-NEXT: v_min_u32_e32 v8, v8, v17 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc +; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v29 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v28, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v0 +; SDAG-NEXT: v_min_u32_e32 v20, v9, v20 +; SDAG-NEXT: v_or_b32_e32 v9, v29, v1 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20 +; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_min_u32_e32 v8, v19, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18 +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v8 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v17, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v9, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8 +; SDAG-NEXT: v_or_b32_e32 v19, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v19 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35 +; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v49 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v48 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v16 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 +; SDAG-NEXT: v_or_b32_e32 v9, v19, v9 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2 +; SDAG-NEXT: v_or_b32_e32 v8, v18, v8 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v39, v38, v28 +; SDAG-NEXT: v_and_b32_e32 v48, v38, v29 +; SDAG-NEXT: v_and_b32_e32 v49, v38, v0 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v38 +; SDAG-NEXT: v_and_b32_e32 v38, v38, v1 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v23, v17 +; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB0_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; SDAG-NEXT: v_or_b32_e32 v20, v19, v1 +; SDAG-NEXT: v_or_b32_e32 v21, v17, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v18, v0 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v2 +; SDAG-NEXT: .LBB0_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v22, v18 +; SDAG-NEXT: v_mov_b32_e32 v23, v19 +; SDAG-NEXT: v_xor_b32_e32 v0, v18, v7 +; SDAG-NEXT: v_xor_b32_e32 v1, v18, v6 +; SDAG-NEXT: v_xor_b32_e32 v3, v18, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v18, v4 +; SDAG-NEXT: v_xor_b32_e32 v6, v19, v15 +; SDAG-NEXT: v_xor_b32_e32 v7, v19, v14 +; SDAG-NEXT: v_xor_b32_e32 v8, v19, v13 +; SDAG-NEXT: v_xor_b32_e32 v10, v19, v12 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v18 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v18, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v5, v2 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v1, v18, vcc +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v3 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v0, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v4 +; SDAG-NEXT: v_min_u32_e32 v11, v1, v11 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v10, v19 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 +; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v5 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v8, v19, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; SDAG-NEXT: v_ffbh_u32_e32 v1, v28 +; SDAG-NEXT: v_min_u32_e32 v8, v10, v12 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v13, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v7, v19, vcc +; SDAG-NEXT: v_add_i32_e64 v7, s[8:9], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v12, v29 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v6, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v6, v28, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 +; SDAG-NEXT: v_min_u32_e32 v12, v7, v12 +; SDAG-NEXT: v_or_b32_e32 v7, v29, v1 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v13, v1 +; SDAG-NEXT: v_add_i32_e32 v12, vcc, 64, v12 +; SDAG-NEXT: v_addc_u32_e64 v14, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_min_u32_e32 v6, v11, v13 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v14, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc +; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6 +; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v10, v8 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v7, v9 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_and_b32_e32 v10, 1, v12 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc +; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6 +; SDAG-NEXT: v_or_b32_e32 v8, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] +; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6 +; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 +; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v14, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mov_b32_e32 v12, 0 +; SDAG-NEXT: v_mov_b32_e32 v13, 0 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 +; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v11, v11, v49 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v48 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v11, 0 +; SDAG-NEXT: .LBB0_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9 +; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 +; SDAG-NEXT: v_or_b32_e32 v8, v8, v39 +; SDAG-NEXT: v_or_b32_e32 v9, v13, v9 +; SDAG-NEXT: v_or_b32_e32 v7, v15, v7 +; SDAG-NEXT: v_or_b32_e32 v8, v12, v8 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2 +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10 +; SDAG-NEXT: v_and_b32_e32 v10, 1, v15 +; SDAG-NEXT: v_and_b32_e32 v38, v15, v1 +; SDAG-NEXT: v_and_b32_e32 v39, v15, v0 +; SDAG-NEXT: v_and_b32_e32 v48, v15, v29 +; SDAG-NEXT: v_and_b32_e32 v15, v15, v28 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v6, v14, v6 +; SDAG-NEXT: v_mov_b32_e32 v15, v11 +; SDAG-NEXT: v_mov_b32_e32 v14, v10 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB0_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB0_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 +; SDAG-NEXT: v_or_b32_e32 v14, v11, v3 +; SDAG-NEXT: v_or_b32_e32 v11, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v2 +; SDAG-NEXT: .LBB0_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26 +; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24 +; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22 +; SDAG-NEXT: v_xor_b32_e32 v6, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3 +; SDAG-NEXT: v_xor_b32_e32 v5, v17, v2 +; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v16, v2 +; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7 +; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6 +; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc +; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_sdiv_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3 +; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1 +; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2 +; GISEL-NEXT: v_xor_b32_e32 v3, v24, v3 +; GISEL-NEXT: v_xor_b32_e32 v8, v25, v8 +; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9 +; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10 +; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc +; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25 +; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v8, v27 +; GISEL-NEXT: v_ffbh_u32_e32 v9, v26 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v26, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v27, v11 +; GISEL-NEXT: v_or_b32_e32 v2, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v3, v17, v19 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v10 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v19 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v8, v9 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v28, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v30, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v2 +; GISEL-NEXT: v_or_b32_e32 v9, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v9, v22, v20 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v32 +; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v8, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v9, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26 +; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28 +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22 +; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34 +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v1, v3, v23 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, v0, v16, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v17, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 +; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28 +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v36, v23 +; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v28, v30 +; GISEL-NEXT: v_or_b32_e32 v1, v29, v31 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v19, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16 +; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v1, v0, v26 +; GISEL-NEXT: v_and_b32_e32 v18, v0, v27 +; GISEL-NEXT: v_and_b32_e32 v16, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v36, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc +; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v16 +; GISEL-NEXT: v_mov_b32_e32 v1, v17 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GISEL-NEXT: s_cbranch_execnz .LBB0_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[14:15] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: .LBB0_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5 +; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6 +; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7 +; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12 +; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13 +; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14 +; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc +; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v14, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v15, v22 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v6 +; GISEL-NEXT: v_or_b32_e32 v0, v22, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v23, v5 +; GISEL-NEXT: v_or_b32_e32 v2, v6, v12 +; GISEL-NEXT: v_or_b32_e32 v3, v7, v13 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v4 +; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v12 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v14, v15 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27 +; GISEL-NEXT: v_min_u32_e32 v2, v16, v17 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v26, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v28, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 +; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v11, v14, v15 +; GISEL-NEXT: v_and_b32_e32 v14, 1, v11 +; GISEL-NEXT: v_or_b32_e32 v10, v11, v10 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v16, 1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v10, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v11, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc +; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16 +; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v16 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v17 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB0_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11 +; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v16, v6 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v14 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v13 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v3, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v1, v27, v29 +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v32, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v33, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v6, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v12, v0, v22 +; GISEL-NEXT: v_and_b32_e32 v13, v0, v23 +; GISEL-NEXT: v_and_b32_e32 v34, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v35, v0, v5 +; GISEL-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-NEXT: v_mov_b32_e32 v1, v7 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB0_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 +; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v4 +; GISEL-NEXT: v_or_b32_e32 v14, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v15, v1, v3 +; GISEL-NEXT: .LBB0_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24 +; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18 +; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3 +; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3 +; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3 +; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3 +; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7 +; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7 +; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v7, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v8, v7, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = sdiv <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_udiv_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23 +; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v25 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v24, v26 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23 +; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc +; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v18, v28 +; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23 +; SDAG-NEXT: v_or_b32_e32 v20, v27, v29 +; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30 +; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20] +; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31 +; SDAG-NEXT: v_or_b32_e32 v20, v24, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18 +; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18 +; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8 +; SDAG-NEXT: s_mov_b64 s[12:13], 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v26, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18 +; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31 +; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v22, v22, v35 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v34 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24 +; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v26, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v34 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v35 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v21 +; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0 +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc +; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21 +; SDAG-NEXT: v_and_b32_e32 v25, v21, v8 +; SDAG-NEXT: v_and_b32_e32 v26, v21, v9 +; SDAG-NEXT: v_and_b32_e32 v34, v21, v10 +; SDAG-NEXT: v_and_b32_e32 v35, v21, v11 +; SDAG-NEXT: v_and_b32_e32 v21, 1, v21 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc +; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; SDAG-NEXT: v_or_b32_e32 v25, v18, v28 +; SDAG-NEXT: v_or_b32_e32 v26, v27, v29 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26] +; SDAG-NEXT: v_or_b32_e32 v17, v20, v17 +; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13] +; SDAG-NEXT: v_or_b32_e32 v16, v19, v16 +; SDAG-NEXT: v_mov_b32_e32 v26, v22 +; SDAG-NEXT: v_mov_b32_e32 v25, v21 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13] +; SDAG-NEXT: s_cbranch_execnz .LBB1_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v8 +; SDAG-NEXT: v_or_b32_e32 v16, v20, v1 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v3 +; SDAG-NEXT: v_or_b32_e32 v17, v19, v0 +; SDAG-NEXT: v_or_b32_e32 v19, v21, v2 +; SDAG-NEXT: .LBB1_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_or_b32_e32 v1, v13, v15 +; SDAG-NEXT: v_or_b32_e32 v0, v12, v14 +; SDAG-NEXT: v_or_b32_e32 v3, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v2, v4, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v8, v14 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v10, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v11, v13 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v7 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v5 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8 +; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v10 +; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v0, v0, v9 +; SDAG-NEXT: v_min_u32_e32 v1, v1, v11 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v21 +; SDAG-NEXT: v_min_u32_e32 v3, v3, v23 +; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1 +; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v1, v3 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; SDAG-NEXT: v_and_b32_e32 v8, 1, v10 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc +; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v8, v24 +; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0 +; SDAG-NEXT: v_or_b32_e32 v2, v11, v25 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2] +; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8 +; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v9, 0 +; SDAG-NEXT: v_mov_b32_e32 v10, 0 +; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27 +; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 +; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v21, v32 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v31 +; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: .LBB1_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3 +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v20 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v30 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v31 +; SDAG-NEXT: v_or_b32_e32 v3, v10, v3 +; SDAG-NEXT: v_or_b32_e32 v1, v23, v1 +; SDAG-NEXT: v_or_b32_e32 v2, v9, v2 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v23 +; SDAG-NEXT: v_and_b32_e32 v30, v23, v15 +; SDAG-NEXT: v_and_b32_e32 v31, v23, v14 +; SDAG-NEXT: v_and_b32_e32 v32, v23, v13 +; SDAG-NEXT: v_and_b32_e32 v23, v23, v12 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc +; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc +; SDAG-NEXT: v_or_b32_e32 v31, v11, v25 +; SDAG-NEXT: v_or_b32_e32 v30, v8, v24 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v0, v22, v0 +; SDAG-NEXT: v_mov_b32_e32 v23, v21 +; SDAG-NEXT: v_mov_b32_e32 v22, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB1_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB1_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; SDAG-NEXT: v_or_b32_e32 v2, v2, v4 +; SDAG-NEXT: v_or_b32_e32 v8, v10, v3 +; SDAG-NEXT: v_or_b32_e32 v10, v21, v1 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v2 +; SDAG-NEXT: v_or_b32_e32 v11, v20, v0 +; SDAG-NEXT: .LBB1_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v19 +; SDAG-NEXT: v_mov_b32_e32 v1, v18 +; SDAG-NEXT: v_mov_b32_e32 v2, v17 +; SDAG-NEXT: v_mov_b32_e32 v3, v16 +; SDAG-NEXT: v_mov_b32_e32 v4, v11 +; SDAG-NEXT: v_mov_b32_e32 v5, v10 +; SDAG-NEXT: v_mov_b32_e32 v6, v9 +; SDAG-NEXT: v_mov_b32_e32 v7, v8 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_udiv_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v16, v2 +; GISEL-NEXT: v_mov_b32_e32 v17, v3 +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v2, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v3, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v20, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v16 +; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v2, v20, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v22, v3 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v20, vcc, v2, v3 +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v20 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v21, v23 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v26, v18 +; GISEL-NEXT: v_and_b32_e32 v18, 1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v24, 1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v22, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc +; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v30 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v3, v21, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v21, s11 +; GISEL-NEXT: v_mov_b32_e32 v20, s10 +; GISEL-NEXT: v_mov_b32_e32 v19, s9 +; GISEL-NEXT: v_mov_b32_e32 v18, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc +; GISEL-NEXT: v_lshl_b64 v[24:25], v[16:17], v24 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[16:17], v32 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_or_b32_e32 v21, v21, v25 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v20, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v21, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v21, s7 +; GISEL-NEXT: v_mov_b32_e32 v20, s6 +; GISEL-NEXT: v_mov_b32_e32 v19, s5 +; GISEL-NEXT: v_mov_b32_e32 v18, s4 +; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v35, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc +; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GISEL-NEXT: v_or_b32_e32 v22, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v23, v19, v21 +; GISEL-NEXT: v_or_b32_e32 v16, v16, v0 +; GISEL-NEXT: v_or_b32_e32 v20, v24, v35 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v20 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v25, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v26, v28 +; GISEL-NEXT: v_or_b32_e32 v19, v27, v29 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v18, v0, v8 +; GISEL-NEXT: v_and_b32_e32 v19, v0, v9 +; GISEL-NEXT: v_and_b32_e32 v21, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v35, v0, v11 +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v34 +; GISEL-NEXT: v_mov_b32_e32 v19, v1 +; GISEL-NEXT: v_mov_b32_e32 v18, v0 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB1_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v8 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v1 +; GISEL-NEXT: .LBB1_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v0, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v13, v15 +; GISEL-NEXT: v_or_b32_e32 v8, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v9, v5, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v16, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v17, v12 +; GISEL-NEXT: v_ffbh_u32_e32 v20, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v14 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v4 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v6 +; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v11, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21 +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v0, v16, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v20, v1 +; GISEL-NEXT: v_min_u32_e32 v8, v22, v8 +; GISEL-NEXT: v_min_u32_e32 v9, v24, v9 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 +; GISEL-NEXT: v_or_b32_e32 v9, v1, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v9, v20, v10 +; GISEL-NEXT: v_and_b32_e32 v10, 1, v9 +; GISEL-NEXT: v_or_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v20, 1, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc +; GISEL-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v26 +; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v21, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v23, s11 +; GISEL-NEXT: v_mov_b32_e32 v22, s10 +; GISEL-NEXT: v_mov_b32_e32 v21, s9 +; GISEL-NEXT: v_mov_b32_e32 v20, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12 +; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22 +; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v28 +; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 +; GISEL-NEXT: v_or_b32_e32 v21, v21, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_mov_b32_e32 v23, s7 +; GISEL-NEXT: v_mov_b32_e32 v22, s6 +; GISEL-NEXT: v_mov_b32_e32 v21, s5 +; GISEL-NEXT: v_mov_b32_e32 v20, s4 +; GISEL-NEXT: .LBB1_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7 +; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1 +; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v16, v4 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v30 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v9, v20, v6 +; GISEL-NEXT: v_or_b32_e32 v10, v21, v7 +; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc +; GISEL-NEXT: v_or_b32_e32 v6, v8, v24 +; GISEL-NEXT: v_or_b32_e32 v7, v11, v25 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v4, 1, v6 +; GISEL-NEXT: v_and_b32_e32 v7, v6, v12 +; GISEL-NEXT: v_and_b32_e32 v30, v6, v13 +; GISEL-NEXT: v_and_b32_e32 v31, v6, v14 +; GISEL-NEXT: v_and_b32_e32 v32, v6, v15 +; GISEL-NEXT: v_mov_b32_e32 v21, v5 +; GISEL-NEXT: v_mov_b32_e32 v20, v4 +; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7 +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB1_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1 +; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10 +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v10, v20, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v21, v5 +; GISEL-NEXT: .LBB1_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mov_b32_e32 v0, v18 +; GISEL-NEXT: v_mov_b32_e32 v1, v19 +; GISEL-NEXT: v_mov_b32_e32 v4, v10 +; GISEL-NEXT: v_mov_b32_e32 v5, v11 +; GISEL-NEXT: v_mov_b32_e32 v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v7, v9 +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = udiv <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_srem_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v11 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v29, v28 +; SDAG-NEXT: v_xor_b32_e32 v18, v3, v28 +; SDAG-NEXT: v_xor_b32_e32 v19, v2, v28 +; SDAG-NEXT: v_xor_b32_e32 v1, v1, v28 +; SDAG-NEXT: v_xor_b32_e32 v0, v0, v28 +; SDAG-NEXT: v_xor_b32_e32 v11, v11, v16 +; SDAG-NEXT: v_xor_b32_e32 v10, v10, v16 +; SDAG-NEXT: v_xor_b32_e32 v20, v9, v16 +; SDAG-NEXT: v_xor_b32_e32 v9, v8, v16 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v28 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v28, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v19, v28, vcc +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v2, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v0 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v21 +; SDAG-NEXT: v_sub_i32_e32 v31, vcc, v9, v16 +; SDAG-NEXT: v_or_b32_e32 v9, v3, v1 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v1 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v30, vcc, v20, v16, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] +; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v10, v16, vcc +; SDAG-NEXT: v_add_i32_e64 v21, s[8:9], 32, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v31, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v16, v8 +; SDAG-NEXT: v_min_u32_e32 v19, v21, v22 +; SDAG-NEXT: v_or_b32_e32 v11, v30, v9 +; SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v9 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_min_u32_e32 v10, v16, v21 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v18 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v10 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v11, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, v3, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v2, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 +; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc +; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20 +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v32, v34 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v33, v35 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24 +; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v24 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v25 +; SDAG-NEXT: v_or_b32_e32 v11, v11, v19 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v18 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 +; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v32 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32 +; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32 +; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32 +; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26 +; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37 +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v27 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v26 +; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v8, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v49, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v48, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v9, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32 +; SDAG-NEXT: v_cndmask_b32_e32 v25, v17, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v24, v16, v2, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v25 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v11 +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v16 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v48 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v49 +; SDAG-NEXT: v_or_b32_e32 v11, v19, v11 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v36, v24 +; SDAG-NEXT: v_or_b32_e32 v10, v18, v10 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v25, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v38, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v39, v27, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v48, v16, v31 +; SDAG-NEXT: v_and_b32_e32 v49, v16, v30 +; SDAG-NEXT: v_and_b32_e32 v50, v16, v8 +; SDAG-NEXT: v_and_b32_e32 v51, v16, v9 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48 +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc +; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32 +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; SDAG-NEXT: v_or_b32_e32 v48, v32, v34 +; SDAG-NEXT: v_or_b32_e32 v49, v33, v35 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49] +; SDAG-NEXT: v_or_b32_e32 v21, v23, v21 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v20, v22, v20 +; SDAG-NEXT: v_mov_b32_e32 v23, v17 +; SDAG-NEXT: v_mov_b32_e32 v22, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB2_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; SDAG-NEXT: v_or_b32_e32 v10, v10, v22 +; SDAG-NEXT: v_or_b32_e32 v35, v19, v11 +; SDAG-NEXT: v_or_b32_e32 v27, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v10 +; SDAG-NEXT: v_or_b32_e32 v33, v16, v20 +; SDAG-NEXT: .LBB2_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 +; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: v_mov_b32_e32 v34, v26 +; SDAG-NEXT: v_xor_b32_e32 v10, v7, v26 +; SDAG-NEXT: v_xor_b32_e32 v11, v6, v26 +; SDAG-NEXT: v_xor_b32_e32 v5, v5, v26 +; SDAG-NEXT: v_xor_b32_e32 v4, v4, v26 +; SDAG-NEXT: v_xor_b32_e32 v15, v15, v16 +; SDAG-NEXT: v_xor_b32_e32 v14, v14, v16 +; SDAG-NEXT: v_xor_b32_e32 v13, v13, v16 +; SDAG-NEXT: v_xor_b32_e32 v12, v12, v16 +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v26 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v5, v26, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v5, v6 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v11, v26, vcc +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v7 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v10, v26, vcc +; SDAG-NEXT: v_or_b32_e32 v10, v6, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v4 +; SDAG-NEXT: v_min_u32_e32 v18, v11, v18 +; SDAG-NEXT: v_sub_i32_e32 v37, vcc, v12, v16 +; SDAG-NEXT: v_or_b32_e32 v11, v7, v5 +; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], 32, v19 +; SDAG-NEXT: v_ffbh_u32_e32 v19, v5 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v36, vcc, v13, v16, vcc +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_ffbh_u32_e32 v11, v37 +; SDAG-NEXT: v_min_u32_e32 v12, v12, v19 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v20, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v14, v16, vcc +; SDAG-NEXT: v_add_i32_e64 v13, s[8:9], 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v14, v36 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v15, v16, vcc +; SDAG-NEXT: v_or_b32_e32 v12, v37, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v15, v10 +; SDAG-NEXT: v_min_u32_e32 v14, v13, v14 +; SDAG-NEXT: v_or_b32_e32 v13, v36, v11 +; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v16, v11 +; SDAG-NEXT: v_add_i32_e32 v14, vcc, 64, v14 +; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] +; SDAG-NEXT: v_min_u32_e32 v12, v15, v16 +; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v20, 0, s[6:7] +; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[6:7] +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v18 +; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc +; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v12 +; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v17, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v17, vcc +; SDAG-NEXT: v_or_b32_e32 v16, v16, v14 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_and_b32_e32 v16, 1, v18 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v7, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v6, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12 +; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12 +; SDAG-NEXT: v_mov_b32_e32 v16, 0 +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc +; SDAG-NEXT: v_lshl_b64 v[18:19], v[6:7], v18 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc +; SDAG-NEXT: v_or_b32_e32 v13, v38, v48 +; SDAG-NEXT: v_sub_i32_e32 v15, vcc, 0x7f, v12 +; SDAG-NEXT: v_or_b32_e32 v14, v39, v49 +; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v15 +; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v15 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v15 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[13:14] +; SDAG-NEXT: v_lshr_b64 v[12:13], v[6:7], v12 +; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 +; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v13, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[16:17], v[6:7], v38 +; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38 +; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38 +; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38 +; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v24 +; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51 +; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v17, v25 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v54, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v53, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38 +; SDAG-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc +; SDAG-NEXT: v_mov_b32_e32 v17, 0 +; SDAG-NEXT: .LBB2_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15 +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v24, v24, v16 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v54 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v55 +; SDAG-NEXT: v_or_b32_e32 v15, v19, v15 +; SDAG-NEXT: v_or_b32_e32 v13, v21, v13 +; SDAG-NEXT: v_or_b32_e32 v14, v18, v14 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v50, v22 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v51, v23, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v52, v24, vcc +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v53, v25, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v16 +; SDAG-NEXT: v_and_b32_e32 v16, 1, v21 +; SDAG-NEXT: v_and_b32_e32 v54, v21, v11 +; SDAG-NEXT: v_and_b32_e32 v55, v21, v10 +; SDAG-NEXT: v_and_b32_e32 v40, v21, v36 +; SDAG-NEXT: v_and_b32_e32 v21, v21, v37 +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v40, vcc +; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v24, v55, vcc +; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v54, vcc +; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v38 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v48, vcc +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v49, vcc +; SDAG-NEXT: v_or_b32_e32 v55, v39, v49 +; SDAG-NEXT: v_or_b32_e32 v54, v38, v48 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v12, v20, v12 +; SDAG-NEXT: v_mov_b32_e32 v21, v17 +; SDAG-NEXT: v_mov_b32_e32 v20, v16 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB2_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB2_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13 +; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v20 +; SDAG-NEXT: v_or_b32_e32 v19, v19, v15 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v13 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v14 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v12 +; SDAG-NEXT: .LBB2_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v14, v33, v9 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0 +; SDAG-NEXT: v_mul_lo_u32 v24, v27, v8 +; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31 +; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0 +; SDAG-NEXT: v_mov_b32_e32 v15, 0 +; SDAG-NEXT: v_mul_lo_u32 v38, v16, v11 +; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v39, v17, v10 +; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37 +; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0 +; SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; SDAG-NEXT: v_mov_b32_e32 v14, v9 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15] +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v21, v38 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v13, v24 +; SDAG-NEXT: v_mov_b32_e32 v24, v23 +; SDAG-NEXT: v_mov_b32_e32 v23, v15 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23] +; SDAG-NEXT: v_xor_b32_e32 v33, v2, v28 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v14, v39 +; SDAG-NEXT: v_mov_b32_e32 v14, v11 +; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15] +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13] +; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v24, v2 +; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v3, v2, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21] +; SDAG-NEXT: v_mov_b32_e32 v18, v23 +; SDAG-NEXT: v_mov_b32_e32 v23, v15 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23] +; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v12 +; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14] +; SDAG-NEXT: v_xor_b32_e32 v16, v16, v29 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3 +; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v18, v9 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v18, v8 +; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v35, v20 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15] +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v2 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; SDAG-NEXT: v_xor_b32_e32 v2, v0, v28 +; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29 +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v33, v28 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v16, v29, vcc +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v10 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v18, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26 +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v8, vcc +; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc +; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26 +; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v34, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v34, vcc +; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_srem_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3 +; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11 +; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v19, 0 +; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28 +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20 +; GISEL-NEXT: v_xor_b32_e32 v9, v9, v20 +; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20 +; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20 +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc +; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20 +; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc +; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v20, v29 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v30 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v17 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v30, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v29, v11 +; GISEL-NEXT: v_or_b32_e32 v2, v16, v8 +; GISEL-NEXT: v_or_b32_e32 v3, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v8 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] +; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v2, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 +; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; GISEL-NEXT: v_min_u32_e32 v3, v26, v3 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v2 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v19, v20, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v20, 1, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v2, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v3, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v18, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v19, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v35, vcc, -1, v30 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v29, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[8:9], v22 +; GISEL-NEXT: v_lshr_b64 v[24:25], v[8:9], v24 +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v17, vcc +; GISEL-NEXT: v_mov_b32_e32 v23, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 +; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v3, v48, v25 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v31, v33 +; GISEL-NEXT: v_or_b32_e32 v1, v32, v34 +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v1, v0, v30 +; GISEL-NEXT: v_and_b32_e32 v25, v0, v29 +; GISEL-NEXT: v_and_b32_e32 v26, v0, v10 +; GISEL-NEXT: v_and_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB2_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v20 +; GISEL-NEXT: v_or_b32_e32 v31, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v32, v1, v3 +; GISEL-NEXT: .LBB2_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15 +; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_xor_b32_e32 v1, v4, v33 +; GISEL-NEXT: v_xor_b32_e32 v4, v5, v33 +; GISEL-NEXT: v_xor_b32_e32 v5, v6, v33 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v33 +; GISEL-NEXT: v_xor_b32_e32 v6, v12, v0 +; GISEL-NEXT: v_xor_b32_e32 v20, v13, v0 +; GISEL-NEXT: v_xor_b32_e32 v14, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v15, v15, v0 +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v1, v33 +; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v4, v33, vcc +; GISEL-NEXT: v_sub_i32_e64 v35, s[4:5], v6, v0 +; GISEL-NEXT: v_subb_u32_e64 v34, s[4:5], v20, v0, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v5, v33, vcc +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v33, vcc +; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v0, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v0, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v20, v34 +; GISEL-NEXT: v_ffbh_u32_e32 v21, v35 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v35, v4 +; GISEL-NEXT: v_or_b32_e32 v1, v34, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v12, v6 +; GISEL-NEXT: v_or_b32_e32 v15, v13, v7 +; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v4 +; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] +; GISEL-NEXT: v_min_u32_e32 v0, v20, v21 +; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25 +; GISEL-NEXT: v_min_u32_e32 v14, v22, v23 +; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v24, v1 +; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 64, v14 +; GISEL-NEXT: v_min_u32_e32 v15, v26, v15 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v0 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_or_b32_e32 v3, v1, v15 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v20, v21 +; GISEL-NEXT: v_and_b32_e32 v20, 1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v3, v2 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v2 +; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 +; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 +; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v14, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v15, vcc +; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24 +; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v14, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v15, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, s8 +; GISEL-NEXT: v_mov_b32_e32 v1, s9 +; GISEL-NEXT: v_mov_b32_e32 v2, s10 +; GISEL-NEXT: v_mov_b32_e32 v3, s11 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 +; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36 +; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v48, vcc, -1, v35 +; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v34, vcc +; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22 +; GISEL-NEXT: v_lshr_b64 v[24:25], v[6:7], v24 +; GISEL-NEXT: v_addc_u32_e32 v50, vcc, -1, v4, vcc +; GISEL-NEXT: v_addc_u32_e32 v51, vcc, -1, v5, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 +; GISEL-NEXT: v_or_b32_e32 v3, v3, v23 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v36 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36 +; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v12, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v13, vcc +; GISEL-NEXT: v_mov_b32_e32 v23, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: .LBB2_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25 +; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15 +; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36 +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v26, v24 +; GISEL-NEXT: v_or_b32_e32 v3, v52, v25 +; GISEL-NEXT: v_or_b32_e32 v14, v14, v22 +; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v3 +; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v53, vcc +; GISEL-NEXT: v_or_b32_e32 v0, v36, v38 +; GISEL-NEXT: v_or_b32_e32 v1, v37, v39 +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v22, 1, v0 +; GISEL-NEXT: v_and_b32_e32 v1, v0, v35 +; GISEL-NEXT: v_and_b32_e32 v25, v0, v34 +; GISEL-NEXT: v_and_b32_e32 v26, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v52, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1 +; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc +; GISEL-NEXT: v_mov_b32_e32 v0, v22 +; GISEL-NEXT: v_mov_b32_e32 v1, v23 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB2_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1 +; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21 +; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 +; GISEL-NEXT: v_or_b32_e32 v20, v0, v22 +; GISEL-NEXT: v_or_b32_e32 v21, v1, v23 +; GISEL-NEXT: .LBB2_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 +; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 +; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v22, v19 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v14 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] +; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] +; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 +; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 +; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = srem <2 x i128> %lhs, %rhs ret <2 x i128> %shl } define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { +; SDAG-LABEL: v_urem_v2i128_vv: +; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v9 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v17, v21 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v18, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc +; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22 +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v30, v32 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v21, v31, v33 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26 +; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27 +; SDAG-NEXT: v_or_b32_e32 v17, v17, v21 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_5 +; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4 +; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30 +; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30 +; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v19, v29 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v28 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3 +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_or_b32_e32 v28, v28, v18 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v38 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v39 +; SDAG-NEXT: v_or_b32_e32 v17, v21, v17 +; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26 +; SDAG-NEXT: v_or_b32_e32 v16, v20, v16 +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18 +; SDAG-NEXT: v_and_b32_e32 v39, v38, v8 +; SDAG-NEXT: v_and_b32_e32 v48, v38, v9 +; SDAG-NEXT: v_and_b32_e32 v49, v38, v10 +; SDAG-NEXT: v_and_b32_e32 v18, 1, v38 +; SDAG-NEXT: v_and_b32_e32 v38, v38, v11 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc +; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; SDAG-NEXT: v_or_b32_e32 v38, v30, v32 +; SDAG-NEXT: v_or_b32_e32 v39, v31, v33 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39] +; SDAG-NEXT: v_or_b32_e32 v23, v25, v23 +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v22, v24, v22 +; SDAG-NEXT: v_mov_b32_e32 v25, v19 +; SDAG-NEXT: v_mov_b32_e32 v24, v18 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB3_3 +; SDAG-NEXT: ; %bb.4: ; %Flow13 +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB3_5: ; %Flow14 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23 +; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 +; SDAG-NEXT: v_or_b32_e32 v33, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v30, v19, v23 +; SDAG-NEXT: v_or_b32_e32 v31, v20, v16 +; SDAG-NEXT: v_or_b32_e32 v32, v18, v22 +; SDAG-NEXT: .LBB3_6: ; %Flow16 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_or_b32_e32 v17, v13, v15 +; SDAG-NEXT: v_or_b32_e32 v16, v12, v14 +; SDAG-NEXT: v_or_b32_e32 v19, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v18, v4, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v20, v14 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v12 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v13 +; SDAG-NEXT: v_ffbh_u32_e32 v24, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v25, v7 +; SDAG-NEXT: v_ffbh_u32_e32 v26, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v27, v5 +; SDAG-NEXT: v_mov_b32_e32 v28, 0 +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 +; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 +; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 +; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 +; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18 +; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc +; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16 +; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc +; SDAG-NEXT: v_or_b32_e32 v20, v20, v18 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v17, v19 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_and_b32_e32 v20, 1, v22 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 +; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5] +; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_12 +; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 +; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 +; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16 +; SDAG-NEXT: v_mov_b32_e32 v20, 0 +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc +; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22 +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v34, v36 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16 +; SDAG-NEXT: v_or_b32_e32 v18, v35, v37 +; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19 +; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18] +; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_11 +; SDAG-NEXT: ; %bb.8: ; %udiv-preheader +; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34 +; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34 +; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34 +; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34 +; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12 +; SDAG-NEXT: s_mov_b64 s[10:11], 0 +; SDAG-NEXT: v_mov_b32_e32 v24, 0 +; SDAG-NEXT: v_mov_b32_e32 v25, 0 +; SDAG-NEXT: v_mov_b32_e32 v22, 0 +; SDAG-NEXT: v_mov_b32_e32 v23, 0 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28 +; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39 +; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v21, v29 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v28 +; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 +; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5] +; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc +; SDAG-NEXT: v_mov_b32_e32 v21, 0 +; SDAG-NEXT: .LBB3_9: ; %udiv-do-while +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27 +; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19 +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v28, v28, v20 +; SDAG-NEXT: v_or_b32_e32 v26, v26, v50 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v51 +; SDAG-NEXT: v_or_b32_e32 v19, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v17, v25, v17 +; SDAG-NEXT: v_or_b32_e32 v18, v22, v18 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26 +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc +; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc +; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20 +; SDAG-NEXT: v_and_b32_e32 v20, 1, v25 +; SDAG-NEXT: v_and_b32_e32 v50, v25, v15 +; SDAG-NEXT: v_and_b32_e32 v51, v25, v14 +; SDAG-NEXT: v_and_b32_e32 v52, v25, v13 +; SDAG-NEXT: v_and_b32_e32 v25, v25, v12 +; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25 +; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc +; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc +; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc +; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34 +; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc +; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; SDAG-NEXT: v_or_b32_e32 v51, v35, v37 +; SDAG-NEXT: v_or_b32_e32 v50, v34, v36 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51] +; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; SDAG-NEXT: v_or_b32_e32 v16, v24, v16 +; SDAG-NEXT: v_mov_b32_e32 v25, v21 +; SDAG-NEXT: v_mov_b32_e32 v24, v20 +; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11] +; SDAG-NEXT: s_cbranch_execnz .LBB3_9 +; SDAG-NEXT: ; %bb.10: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] +; SDAG-NEXT: .LBB3_11: ; %Flow11 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17 +; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1 +; SDAG-NEXT: v_or_b32_e32 v18, v18, v24 +; SDAG-NEXT: v_or_b32_e32 v23, v23, v19 +; SDAG-NEXT: v_or_b32_e32 v21, v21, v17 +; SDAG-NEXT: v_or_b32_e32 v22, v22, v18 +; SDAG-NEXT: v_or_b32_e32 v20, v20, v16 +; SDAG-NEXT: .LBB3_12: ; %Flow12 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11 +; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10 +; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8 +; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0 +; SDAG-NEXT: v_mov_b32_e32 v19, 0 +; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15 +; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0 +; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14 +; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12 +; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13 +; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0 +; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18 +; SDAG-NEXT: v_mov_b32_e32 v18, v11 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19] +; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10 +; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28 +; SDAG-NEXT: v_mov_b32_e32 v28, v27 +; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27] +; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35 +; SDAG-NEXT: v_mov_b32_e32 v18, v15 +; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19] +; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17] +; SDAG-NEXT: v_mov_b32_e32 v8, v11 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8 +; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mov_b32_e32 v8, v10 +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25] +; SDAG-NEXT: v_mov_b32_e32 v22, v27 +; SDAG-NEXT: v_mov_b32_e32 v27, v19 +; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16 +; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18] +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11 +; SDAG-NEXT: v_mov_b32_e32 v11, v20 +; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11 +; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16 +; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17 +; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12] +; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15 +; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10 +; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc +; SDAG-NEXT: v_mov_b32_e32 v10, v19 +; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14 +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_urem_v2i128_vv: +; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v16, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v17, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v18, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v9 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v8 +; GISEL-NEXT: v_ffbh_u32_e32 v24, v11 +; GISEL-NEXT: v_ffbh_u32_e32 v25, v10 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v3 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v2 +; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v21, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29 +; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 +; GISEL-NEXT: v_min_u32_e32 v17, v24, v17 +; GISEL-NEXT: v_min_u32_e32 v18, v26, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v28, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v20, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v21, v17, v19 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v21, v22, v23 +; GISEL-NEXT: v_and_b32_e32 v22, 1, v21 +; GISEL-NEXT: v_or_b32_e32 v20, v21, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v22, 1, v20 +; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v18, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc +; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v26 +; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20 +; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v20, v18 +; GISEL-NEXT: v_or_b32_e32 v17, v21, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26 +; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_5 +; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 +; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v8 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc +; GISEL-NEXT: v_lshl_b64 v[24:25], v[2:3], v24 +; GISEL-NEXT: v_lshr_b64 v[26:27], v[2:3], v26 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v24 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v25 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 +; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_mov_b32_e32 v19, s7 +; GISEL-NEXT: v_mov_b32_e32 v18, s6 +; GISEL-NEXT: v_mov_b32_e32 v17, s5 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3 +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1 +; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27 +; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30 +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc +; GISEL-NEXT: v_or_b32_e32 v22, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v23, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v28, v26 +; GISEL-NEXT: v_or_b32_e32 v19, v38, v27 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v24 +; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v30, v32 +; GISEL-NEXT: v_or_b32_e32 v17, v31, v33 +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v24, 1, v16 +; GISEL-NEXT: v_and_b32_e32 v17, v16, v8 +; GISEL-NEXT: v_and_b32_e32 v27, v16, v9 +; GISEL-NEXT: v_and_b32_e32 v28, v16, v10 +; GISEL-NEXT: v_and_b32_e32 v16, v16, v11 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc +; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc +; GISEL-NEXT: v_mov_b32_e32 v16, v24 +; GISEL-NEXT: v_mov_b32_e32 v17, v25 +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB3_3 +; GISEL-NEXT: ; %bb.4: ; %Flow13 +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_5: ; %Flow14 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23 +; GISEL-NEXT: v_or_b32_e32 v20, v20, v22 +; GISEL-NEXT: v_or_b32_e32 v32, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v33, v17, v19 +; GISEL-NEXT: .LBB3_6: ; %Flow16 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_mov_b64 s[8:9], 0 +; GISEL-NEXT: v_or_b32_e32 v16, v12, v14 +; GISEL-NEXT: v_or_b32_e32 v17, v13, v15 +; GISEL-NEXT: v_or_b32_e32 v18, v4, v6 +; GISEL-NEXT: v_or_b32_e32 v19, v5, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v22, v13 +; GISEL-NEXT: v_ffbh_u32_e32 v23, v12 +; GISEL-NEXT: v_ffbh_u32_e32 v26, v15 +; GISEL-NEXT: v_ffbh_u32_e32 v27, v14 +; GISEL-NEXT: v_ffbh_u32_e32 v28, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v29, v4 +; GISEL-NEXT: v_ffbh_u32_e32 v30, v7 +; GISEL-NEXT: v_ffbh_u32_e32 v31, v6 +; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v25, 0 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] +; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23 +; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v27 +; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v29 +; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v31 +; GISEL-NEXT: v_min_u32_e32 v16, v22, v16 +; GISEL-NEXT: v_min_u32_e32 v17, v26, v17 +; GISEL-NEXT: v_min_u32_e32 v18, v28, v18 +; GISEL-NEXT: v_min_u32_e32 v19, v30, v19 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25] +; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v19, v17, v23 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v19, v26, v24 +; GISEL-NEXT: v_and_b32_e32 v24, 1, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v19, v18 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24 +; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc +; GISEL-NEXT: v_and_b32_e32 v26, 1, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_12 +; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16 +; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v22, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v23, vcc +; GISEL-NEXT: v_subrev_i32_e64 v24, s[4:5], 64, v28 +; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28 +; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28 +; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1 +; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22 +; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v22, v18 +; GISEL-NEXT: v_or_b32_e32 v17, v23, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28 +; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc +; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v19, s11 +; GISEL-NEXT: v_mov_b32_e32 v18, s10 +; GISEL-NEXT: v_mov_b32_e32 v17, s9 +; GISEL-NEXT: v_mov_b32_e32 v16, s8 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_11 +; GISEL-NEXT: ; %bb.8: ; %udiv-preheader +; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 +; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34 +; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_add_i32_e32 v38, vcc, -1, v12 +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc +; GISEL-NEXT: v_lshl_b64 v[26:27], v[6:7], v26 +; GISEL-NEXT: v_lshr_b64 v[28:29], v[6:7], v28 +; GISEL-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc +; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v26 +; GISEL-NEXT: v_or_b32_e32 v19, v19, v27 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v34 +; GISEL-NEXT: v_cndmask_b32_e32 v18, v28, v18, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, v29, v19, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v30, 0, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34 +; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v5, vcc +; GISEL-NEXT: v_mov_b32_e32 v27, 0 +; GISEL-NEXT: v_mov_b32_e32 v19, s7 +; GISEL-NEXT: v_mov_b32_e32 v18, s6 +; GISEL-NEXT: v_mov_b32_e32 v17, s5 +; GISEL-NEXT: v_mov_b32_e32 v16, s4 +; GISEL-NEXT: .LBB3_9: ; %udiv-do-while +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25 +; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1 +; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v29 +; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23 +; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34 +; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc +; GISEL-NEXT: v_or_b32_e32 v24, v16, v18 +; GISEL-NEXT: v_or_b32_e32 v25, v17, v19 +; GISEL-NEXT: v_or_b32_e32 v18, v30, v28 +; GISEL-NEXT: v_or_b32_e32 v19, v50, v29 +; GISEL-NEXT: v_or_b32_e32 v22, v22, v26 +; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v19 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v51, vcc +; GISEL-NEXT: v_or_b32_e32 v16, v34, v36 +; GISEL-NEXT: v_or_b32_e32 v17, v35, v37 +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc +; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26 +; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v26, 1, v16 +; GISEL-NEXT: v_and_b32_e32 v17, v16, v12 +; GISEL-NEXT: v_and_b32_e32 v29, v16, v13 +; GISEL-NEXT: v_and_b32_e32 v30, v16, v14 +; GISEL-NEXT: v_and_b32_e32 v50, v16, v15 +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17 +; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc +; GISEL-NEXT: v_mov_b32_e32 v16, v26 +; GISEL-NEXT: v_mov_b32_e32 v17, v27 +; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc +; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc +; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GISEL-NEXT: s_cbranch_execnz .LBB3_9 +; GISEL-NEXT: ; %bb.10: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_11: ; %Flow11 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1 +; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1 +; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25 +; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 +; GISEL-NEXT: v_or_b32_e32 v24, v16, v26 +; GISEL-NEXT: v_or_b32_e32 v25, v17, v27 +; GISEL-NEXT: .LBB3_12: ; %Flow12 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 +; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 +; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] +; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc +; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl } diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll index 0069370..05558c5 100644 --- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll +++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll @@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" } ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index 4ed1b8a..e198197 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee ret void } -; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode: -; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; GCN-DENORM-NOT: v_max -; GCN-DENORM-NOT: v_mul - -; GFX9: {{flat|global}}_store_dword -define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float 0.0) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float 0.0) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} @@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_denorm_value_f32: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] - -; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]] - -; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] - -; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]] -; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]] - -; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]] - -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] -define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { - %id = tail call i32 @llvm.amdgcn.workitem.id.x() - %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id - %load = load float, ptr addrspace(1) %gep, align 4 - %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) - %canonicalized = tail call float @llvm.canonicalize.f32(float %v) - store float %canonicalized, ptr addrspace(1) %gep, align 4 - ret void -} +; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) { +; %id = tail call i32 @llvm.amdgcn.workitem.id.x() +; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id +; %load = load float, ptr addrspace(1) %gep, align 4 +; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float)) +; %canonicalized = tail call float @llvm.canonicalize.f32(float %v) +; store float %canonicalized, ptr addrspace(1) %gep, align 4 +; ret void +; } ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] @@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp } ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 -; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]], -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] +; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]], +; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id @@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ret half %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16: -; GFX9: v_mul_f16_e32 -; GFX9: v_pk_mul_f16 -; GFX9-NOT: v_max -; GFX9-NOT: v_pk_max -define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { - %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> - %ins.op = fmul half %val, 8.0 - %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { +; %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0> +; %ins.op = fmul half %val, 8.0 +; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16: ; GFX9: v_mul_f16 @@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz: -; GCN: s_waitcnt -; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 -define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { - %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) - %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) - ret <2 x half> %canonicalized -} +; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) { +; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b) +; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt) +; ret <2 x half> %canonicalized +; } ; GCN-LABEL: {{^}}v_test_canonicalize_cubeid: ; GCN: s_waitcnt diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 27462130..581b7b4 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ret void } +define half @s_test_canonicalize_arg(half %x) #1 { +; VI-LABEL: s_test_canonicalize_arg: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: s_test_canonicalize_arg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; CI-LABEL: s_test_canonicalize_arg: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; CI-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: s_test_canonicalize_arg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %canonicalized = call half @llvm.canonicalize.f16(half %x) + ret half %canonicalized +} + define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 { ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: @@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm @@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1| ; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; CI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: @@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: @@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: @@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1 -; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; @@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v6f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v6f16: @@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v8f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v8f16: @@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v12f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v12f16: @@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; CI-LABEL: v_test_canonicalize_var_v16f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_var_v16f16: @@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; CI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; CI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; CI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; CI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104 +; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v1, v1, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v10 ; CI-NEXT: v_cvt_f16_f32_e32 v9, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v16 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_or_b32_e32 v3, v4, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v4, v8 ; CI-NEXT: v_cvt_f16_f32_e32 v8, v14 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v21 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v26 +; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32 +; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; CI-NEXT: v_or_b32_e32 v4, v5, v4 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 ; CI-NEXT: v_cvt_f16_f32_e32 v6, v12 ; CI-NEXT: v_or_b32_e32 v5, v7, v5 ; CI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; CI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; CI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v22 ; CI-NEXT: v_or_b32_e32 v6, v7, v6 ; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v8, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v19 ; CI-NEXT: v_or_b32_e32 v7, v9, v7 -; CI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; CI-NEXT: v_cvt_f16_f32_e32 v15, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v9, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v8 +; CI-NEXT: v_cvt_f32_f16_e32 v8, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v18 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112 +; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116 +; CI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 -; CI-NEXT: v_or_b32_e32 v8, v9, v8 +; CI-NEXT: v_or_b32_e32 v8, v10, v8 +; CI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v9 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10 -; CI-NEXT: v_cvt_f16_f32_e32 v10, v20 ; CI-NEXT: v_or_b32_e32 v9, v11, v9 -; CI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4 -; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12 -; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8 -; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; CI-NEXT: v_or_b32_e32 v10, v11, v10 -; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v22 +; CI-NEXT: v_or_b32_e32 v10, v12, v10 +; CI-NEXT: v_cvt_f16_f32_e32 v12, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v11 +; CI-NEXT: v_cvt_f32_f16_e32 v11, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v30 +; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; CI-NEXT: v_or_b32_e32 v11, v13, v11 -; CI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20 -; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24 -; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; CI-NEXT: v_cvt_f16_f32_e32 v24, v30 -; CI-NEXT: v_or_b32_e32 v12, v13, v12 -; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; CI-NEXT: v_or_b32_e32 v13, v15, v13 -; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v12 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v29 +; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 +; CI-NEXT: v_or_b32_e32 v12, v15, v12 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v15, v31 +; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 +; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v15 ; CI-NEXT: v_cvt_f16_f32_e32 v15, v27 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v33 +; CI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; CI-NEXT: v_or_b32_e32 v13, v16, v13 +; CI-NEXT: v_cvt_f16_f32_e32 v16, v32 +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12 ; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; CI-NEXT: v_or_b32_e32 v14, v15, v14 -; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24 +; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22 ; CI-NEXT: v_or_b32_e32 v15, v25, v15 -; CI-NEXT: s_waitcnt vmcnt(11) -; CI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; CI-NEXT: s_waitcnt vmcnt(10) -; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v21 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 +; CI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64 +; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v16 +; CI-NEXT: v_or_b32_e32 v16, v24, v25 +; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27 +; CI-NEXT: v_or_b32_e32 v25, v28, v24 ; CI-NEXT: s_waitcnt vmcnt(9) ; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; CI-NEXT: v_or_b32_e32 v16, v17, v16 -; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18 -; CI-NEXT: v_or_b32_e32 v17, v19, v17 ; CI-NEXT: s_waitcnt vmcnt(7) -; CI-NEXT: v_cvt_f16_f32_e32 v18, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; CI-NEXT: v_or_b32_e32 v20, v19, v20 +; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; CI-NEXT: s_waitcnt vmcnt(6) -; CI-NEXT: v_cvt_f16_f32_e32 v19, v21 -; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v34 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; CI-NEXT: v_or_b32_e32 v17, v17, v26 +; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0 +; CI-NEXT: v_or_b32_e32 v18, v27, v18 +; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0 +; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: v_cvt_f16_f32_e32 v20, v22 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v23 -; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; CI-NEXT: v_or_b32_e32 v18, v19, v18 -; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20 -; CI-NEXT: v_or_b32_e32 v19, v21, v19 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v20, v26 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v21, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92 +; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 +; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84 +; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72 +; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; CI-NEXT: v_cvt_f32_f16_e32 v29, v29 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 ; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; CI-NEXT: v_or_b32_e32 v20, v21, v20 -; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26 -; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52 -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 -; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56 -; CI-NEXT: v_or_b32_e32 v21, v27, v21 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128 -; CI-NEXT: s_waitcnt vmcnt(5) -; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; CI-NEXT: s_waitcnt vmcnt(4) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0 +; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; CI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; CI-NEXT: s_waitcnt vmcnt(13) +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; CI-NEXT: s_waitcnt vmcnt(12) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v24 +; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16 +; CI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; CI-NEXT: s_waitcnt vmcnt(1) +; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; CI-NEXT: v_or_b32_e32 v20, v23, v20 +; CI-NEXT: s_waitcnt vmcnt(9) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: s_waitcnt vmcnt(8) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v28 +; CI-NEXT: s_waitcnt vmcnt(7) +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: s_waitcnt vmcnt(6) +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; CI-NEXT: v_or_b32_e32 v24, v25, v24 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; CI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; CI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_or_b32_e32 v22, v22, v23 -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88 -; CI-NEXT: s_waitcnt vmcnt(2) +; CI-NEXT: v_or_b32_e32 v23, v27, v23 +; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 +; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen +; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36 +; CI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; CI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_or_b32_e32 v17, v17, v18 +; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0 +; CI-NEXT: v_or_b32_e32 v25, v25, v26 +; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0 +; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; CI-NEXT: v_or_b32_e32 v19, v24, v19 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_or_b32_e32 v21, v22, v21 +; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40 +; CI-NEXT: s_waitcnt vmcnt(5) +; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: s_waitcnt vmcnt(4) +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; CI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v22 +; CI-NEXT: v_or_b32_e32 v22, v23, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52 +; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; CI-NEXT: v_or_b32_e32 v23, v28, v23 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48 +; CI-NEXT: s_waitcnt vmcnt(2) ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v26 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 -; CI-NEXT: s_waitcnt vmcnt(3) +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; CI-NEXT: v_or_b32_e32 v23, v23, v27 -; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; CI-NEXT: v_or_b32_e32 v24, v24, v27 +; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0 -; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68 -; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 -; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen -; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76 -; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72 -; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84 -; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80 -; CI-NEXT: s_waitcnt vmcnt(3) -; CI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; CI-NEXT: v_or_b32_e32 v25, v26, v25 -; CI-NEXT: s_waitcnt vmcnt(2) -; CI-NEXT: v_cvt_f16_f32_e32 v26, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; CI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; CI-NEXT: v_or_b32_e32 v27, v28, v27 +; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v27, v29 -; CI-NEXT: v_or_b32_e32 v23, v26, v23 -; CI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; CI-NEXT: v_or_b32_e32 v26, v27, v26 -; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0 -; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0 -; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0 -; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0 -; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0 -; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0 -; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0 -; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 -; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0 -; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; CI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; CI-NEXT: v_or_b32_e32 v28, v29, v28 +; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0 +; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0 +; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 +; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0 +; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0 +; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0 +; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen +; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 ; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen ; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index c1093a1..d53c041 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 @@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) % ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX6-NEXT: flat_load_dword v0, v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v3, s3 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0 -; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: v_or_b32_e32 v4, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX6-NEXT: flat_store_dword v[0:1], v4 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 78fb89c..b32630a 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: v_mul_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 { ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0, v0 ; SI-NEXT: v_mul_f32_e64 v0, -v0, v1 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b) ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1 ; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 { ; SI-LABEL: v_fneg_canonicalize_f16: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fneg_canonicalize_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index 17f6761..b5440b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 { ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0 ; SI-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir index 3616d61..5ef8a94 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir @@ -8,6 +8,8 @@ --- name: restore_undef_copy_use tracksRegLiveness: true +frameInfo: + adjustsStack: true machineFunctionInfo: maxKernArgAlign: 1 isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 6fa7df9..18d2e52 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isinf_pattern_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0x7f800000 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0| -; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isinf_pattern_f16: @@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_0_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0_f16: @@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 { ; SI-LABEL: test_isfinite_pattern_4_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xb -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s1, 0x1f8 +; SI-NEXT: s_load_dword s4, s[0:1], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x7fff +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_4_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll index 767d347..a948fab 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -1181,18 +1181,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB42_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1200,20 +1210,30 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB42_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB42_2 +; GFX90A-NEXT: .LBB42_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB42_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB42_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst @@ -1223,26 +1243,45 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB43_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB43_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB43_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB43_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1252,18 +1291,28 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB44_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol @@ -1271,20 +1320,30 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB44_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB44_2 +; GFX90A-NEXT: .LBB44_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB44_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: .LBB44_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst @@ -1294,26 +1353,45 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB45_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol +; GFX90A-NEXT: .LBB45_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB45_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB45_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -1485,37 +1563,57 @@ main_body: define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) { ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB52_3 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz .LBB52_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB52_2 +; GFX90A-NEXT: .LBB52_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB52_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: buffer_wbl2 sc1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: .LBB52_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst @@ -2020,23 +2118,42 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB70_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB70_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB70_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB70_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2046,23 +2163,42 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB71_2 +; GFX90A-NEXT: ; %bb.1: ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: .LBB71_2: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: ; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB71_2 +; GFX940-NEXT: ; %bb.1: ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 -; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v2, s0 ; GFX940-NEXT: ds_add_f64 v2, v[0:1] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: .LBB71_2: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst @@ -2072,46 +2208,66 @@ main_body: define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 { ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX90A: ; %bb.0: ; %main_body -; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: s_mov_b64 s[2:3], exec +; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX90A-NEXT: s_cbranch_execz .LBB72_3 +; GFX90A-NEXT: ; %bb.1: +; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: ds_read_b64 v[0:1], v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, s2 -; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GFX90A-NEXT: ds_read_b64 v[2:3], v0 +; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX90A-NEXT: s_mov_b64 s[0:1], 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, s4 +; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1] ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execnz .LBB72_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT: s_cbranch_execnz .LBB72_2 +; GFX90A-NEXT: .LBB72_3: ; GFX90A-NEXT: s_endpgm ; ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: ; GFX940: ; %bb.0: ; %main_body -; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_mov_b64 s[2:3], exec +; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX940-NEXT: s_cbranch_execz .LBB72_3 +; GFX940-NEXT: ; %bb.1: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: ds_read_b64 v[0:1], v0 -; GFX940-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: ds_read_b64 v[2:3], v0 +; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: v_mov_b32_e32 v4, s4 +; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0 -; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5] +; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1] +; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7] ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1] +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5] +; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7] ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB72_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_cbranch_execnz .LBB72_2 +; GFX940-NEXT: .LBB72_3: ; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll new file mode 100644 index 0000000..66bf0d5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -0,0 +1,1502 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +define i128 @fptosi_f64_to_i128(double %x) { +; SDAG-LABEL: fptosi_f64_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB0_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB0_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 +; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: .LBB0_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB0_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v7, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: .LBB0_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB0_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptosi_f64_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB0_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, -1 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB0_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: .LBB0_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB0_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: .LBB0_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB0_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB0_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB0_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi double %x to i128 + ret i128 %cvt +} + +define i128 @fptoui_f64_to_i128(double %x) { +; SDAG-LABEL: fptoui_f64_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11 +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB1_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB1_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6 +; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6 +; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, v1 +; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3] +; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 +; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13 +; SDAG-NEXT: v_mov_b32_e32 v6, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v3 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr11 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: .LBB1_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB1_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v7, v4 +; SDAG-NEXT: v_mov_b32_e32 v4, v2 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4] +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4] +; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 +; SDAG-NEXT: .LBB1_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptoui_f64_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB1_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, -1 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v20 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v20 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x433 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB1_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: .LBB1_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB1_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: .LBB1_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: .LBB1_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB1_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB1_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui double %x to i128 + ret i128 %cvt +} + +define i128 @fptosi_f32_to_i128(float %x) { +; SDAG-LABEL: fptosi_f32_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB2_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB2_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_mov_b32_e32 v6, v1 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: .LBB2_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB2_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: .LBB2_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB2_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptosi_f32_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB2_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, -1 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB2_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: .LBB2_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB2_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: .LBB2_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB2_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB2_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi float %x to i128 + ret i128 %cvt +} + +define i128 @fptoui_f32_to_i128(float %x) { +; SDAG-LABEL: fptoui_f32_to_i128: +; SDAG: ; %bb.0: ; %fp-to-i-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8 +; SDAG-NEXT: s_movk_i32 s4, 0x7e +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: v_mov_b32_e32 v3, 0 +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_10 +; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc +; SDAG-NEXT: s_movk_i32 s4, 0xff7f +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: s_mov_b32 s5, -1 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3] +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 +; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 +; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5] +; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_mov_b32_e32 v7, 0 +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SDAG-NEXT: s_cbranch_execz .LBB3_4 +; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else +; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5 +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5 +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] +; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0 +; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2 +; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3 +; SDAG-NEXT: v_mov_b32_e32 v6, v1 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mov_b32_e32 v6, v5 +; SDAG-NEXT: v_mov_b32_e32 v5, v7 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5] +; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14 +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3] +; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5 +; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12 +; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13 +; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6] +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3 +; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5] +; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: .LBB3_4: ; %Flow +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB3_6 +; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mov_b32_e32 v1, v4 +; SDAG-NEXT: .LBB3_6: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: .LBB3_7: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 +; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_mov_b32_e32 v0, v2 +; SDAG-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-NEXT: ; %bb.9: ; %Flow3 +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: fptoui_f32_to_i128: +; GISEL: ; %bb.0: ; %fp-to-i-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5] +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_mov_b32_e32 v7, v5 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_10 +; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc +; GISEL-NEXT: v_mov_b32_e32 v3, -1 +; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9] +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9 +; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7] +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0 +; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v2 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v3 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v5 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v8 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v8 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v9 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v9 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v10 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v11 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v11 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v12 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v13 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v13 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v14 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v14 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v15 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v15 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v16 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v16 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v17 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v17 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v18 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v18 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v19 +; GISEL-NEXT: v_or_b32_e32 v1, v1, v19 +; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0 +; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1 +; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x96 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] +; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, 0 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] +; GISEL-NEXT: s_cbranch_execz .LBB3_4 +; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 +; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v6 +; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: ; implicit-def: $vgpr6 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: .LBB3_4: ; %Flow +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17] +; GISEL-NEXT: s_cbranch_execz .LBB3_6 +; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12 +; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6 +; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0 +; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: .LBB3_6: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_7: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15] +; GISEL-NEXT: s_cbranch_execz .LBB3_9 +; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5 +; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v1, 1, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v2, v1, v2 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14 +; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14 +; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16 +; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16 +; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18 +; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18 +; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20 +; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20 +; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4 +; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4 +; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6 +; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8 +; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8 +; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10 +; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10 +; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12 +; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1 +; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12 +; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1 +; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1 +; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GISEL-NEXT: .LBB3_9: ; %Flow3 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui float %x to i128 + ret i128 %cvt +} + +define i128 @fptosi_f16_to_i128(half %x) { +; GCN-LABEL: fptosi_f16_to_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_mov_b32_e32 v2, v1 +; GCN-NEXT: v_mov_b32_e32 v3, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %cvt = fptosi half %x to i128 + ret i128 %cvt +} + +define i128 @fptoui_f16_to_i128(half %x) { +; GCN-LABEL: fptoui_f16_to_i128: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %cvt = fptoui half %x to i128 + ret i128 %cvt +} + +; FIXME: ExpandLargeFpConvert asserts on bfloat +; define i128 @fptosi_bf16_to_i128(bfloat %x) { +; %cvt = fptosi bfloat %x to i128 +; ret i128 %cvt +; } + +; define i128 @fptoui_bf16_to_i128(bfloat %x) { +; %cvt = fptoui bfloat %x to i128 +; ret i128 %cvt +; } diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 3a0b825..e361aa4 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -1705,16 +1705,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fffff, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: s_movk_i32 s10, 0x204 +; GFX6-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc -; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v0, s10 +; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, s[8:9] -; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v1, s10 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, vcc +; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, s[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1722,19 +1722,19 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX7-LABEL: safe_math_fract_v2f32: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX7-NEXT: v_fract_f32_e32 v6, v0 -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 +; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_floor_f32_e32 v4, v0 ; GFX7-NEXT: v_fract_f32_e32 v7, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 ; GFX7-NEXT: v_floor_f32_e32 v5, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc ; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1742,15 +1742,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX8-LABEL: safe_math_fract_v2f32: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX8-NEXT: v_mov_b32_e32 v8, 0x204 ; GFX8-NEXT: v_fract_f32_e32 v6, v0 -; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4 +; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v8 ; GFX8-NEXT: v_floor_f32_e32 v4, v0 ; GFX8-NEXT: v_fract_f32_e32 v7, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s4 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc +; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v1, v8 ; GFX8-NEXT: v_floor_f32_e32 v5, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc ; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1759,14 +1759,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_fract_f32_e32 v6, v0 -; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| +; GFX11-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204 ; GFX11-NEXT: v_fract_f32_e32 v7, v1 ; GFX11-NEXT: v_floor_f32_e32 v4, v0 ; GFX11-NEXT: v_floor_f32_e32 v5, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo -; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 +; GFX11-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204 ; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x) @@ -1937,21 +1938,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX6: ; %bb.0: ; %entry ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_floor_f32_e32 v3, v0 -; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: v_floor_f32_e32 v4, v3 +; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1959,21 +1961,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly % ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_floor_f32_e32 v3, v0 -; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 -; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4 -; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc -; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: v_floor_f32_e32 v4, v3 +; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5 +; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc +; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2062,12 +2065,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX6-NEXT: s_movk_i32 s8, 0x7c00 ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX6-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX6-NEXT: v_floor_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX6-NEXT: v_floor_f32_e32 v8, v5 @@ -2080,10 +2083,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX6-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 @@ -2098,12 +2101,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: s_mov_b32 s8, 0x7f800000 +; GFX7-NEXT: s_movk_i32 s8, 0x7c00 ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 -; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1| +; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1 ; GFX7-NEXT: v_floor_f32_e32 v6, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6 ; GFX7-NEXT: v_floor_f32_e32 v8, v5 @@ -2116,10 +2119,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc -; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1 +; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 @@ -2133,16 +2136,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: s_movk_i32 s6, 0x204 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x204 ; GFX8-NEXT: v_floor_f16_e32 v4, v3 ; GFX8-NEXT: v_floor_f16_e32 v5, v0 ; GFX8-NEXT: v_fract_f16_e32 v6, v3 -; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v3, s6 +; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v3, v7 ; GFX8-NEXT: v_pack_b32_f16 v4, v5, v4 ; GFX8-NEXT: v_fract_f16_e32 v5, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, s[4:5] -; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc +; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc ; GFX8-NEXT: v_pack_b32_f16 v0, v0, v3 ; GFX8-NEXT: global_store_dword v[1:2], v4, off ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2237,19 +2240,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX6-NEXT: s_movk_i32 s10, 0x204 -; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s10 +; GFX6-NEXT: v_mov_b32_e32 v14, 0x204 ; GFX6-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, s[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, s[8:9] -; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[2:3], s10 +; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v14 ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, vcc +; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v14 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 ; GFX6-NEXT: s_mov_b32 s5, s6 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, s[8:9] -; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, s[8:9] +; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, vcc ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -2257,39 +2260,39 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc ; GFX7-LABEL: safe_math_fract_v2f64: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_movk_i32 s4, 0x204 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x204 ; GFX7-NEXT: v_fract_f64_e32 v[10:11], v[0:1] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s4 +; GFX7-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6 ; GFX7-NEXT: v_fract_f64_e32 v[12:13], v[2:3] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[2:3], s4 +; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6 ; GFX7-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s4, s6 -; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[10:11] -; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64 +; GFX7-NEXT: s_mov_b32 s10, 0 +; GFX7-NEXT: s_mov_b32 s11, 0xf000 +; GFX7-NEXT: s_mov_b32 s8, s10 +; GFX7-NEXT: s_mov_b32 s9, s10 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5] +; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: safe_math_fract_v2f64: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_movk_i32 s6, 0x204 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x204 ; GFX8-NEXT: v_fract_f64_e32 v[10:11], v[0:1] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], s6 +; GFX8-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6 ; GFX8-NEXT: v_fract_f64_e32 v[12:13], v[2:3] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], s6 +; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6 ; GFX8-NEXT: v_floor_f64_e32 v[8:9], v[2:3] ; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5] ; GFX8-NEXT: global_store_dwordx4 v[4:5], v[6:9], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll index e3fada3..b717280 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll @@ -1,71 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s + +; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. +; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction +; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes. define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP16]] -; IR-ITERATIVE: 16: -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] -; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] -; IR-ITERATIVE-NEXT: br label [[TMP24]] -; IR-ITERATIVE: 24: -; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP25]] -; -; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-DPP: 14: -; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP16]] -; IR-DPP: 16: -; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 -; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) -; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float -; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] -; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] -; IR-DPP-NEXT: br label [[TMP24]] -; IR-DPP: 24: -; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] -; IR-DPP-NEXT: ret float [[TMP25]] +; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32 +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]]) +; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float +; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]] +; IR-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]] +; IR-NEXT: br label [[TMP24]] +; IR: 24: +; IR-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ] +; IR-NEXT: ret float [[TMP25]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result @@ -411,7 +383,6 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str ret float %result } - define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] @@ -514,61 +485,33 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str } define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP12]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) -; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] -; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) -; IR-ITERATIVE-NEXT: br label [[TMP20]] -; IR-ITERATIVE: 20: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-ITERATIVE-NEXT: ret float [[TMP21]] -; -; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-DPP: 10: -; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP12]] -; IR-DPP: 12: -; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 -; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) -; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float -; IR-DPP-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] -; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) -; IR-DPP-NEXT: br label [[TMP20]] -; IR-DPP: 20: -; IR-DPP-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] -; IR-DPP-NEXT: ret float [[TMP21]] +; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32 +; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]]) +; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float +; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float +; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]] +; IR-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]]) +; IR-NEXT: br label [[TMP20]] +; IR: 20: +; IR-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ] +; IR-NEXT: ret float [[TMP21]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result @@ -1007,164 +950,674 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st ret float %result } - define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret float %result } define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } - define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result } define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-ITERATIVE-NEXT: ret float [[RESULT]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 -; IR-DPP-NEXT: ret float [[RESULT]] +; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret float [[RESULT]] ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 ret float %result } +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double +; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) +; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) +; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP28:%.*]] = fmul double [[VAL]], [[TMP27]] +; IR-NEXT: [[TMP29:%.*]] = fadd double [[TMP26]], [[TMP28]] +; IR-NEXT: br label [[TMP30]] +; IR: 30: +; IR-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) +; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) +; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-NEXT: [[TMP23:%.*]] = uitofp i32 [[TMP8]] to double +; IR-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]] +; IR-NEXT: [[TMP25:%.*]] = call double @llvm.minnum.f64(double [[TMP22]], double [[TMP24]]) +; IR-NEXT: br label [[TMP26]] +; IR: 26: +; IR-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-NEXT: ret double [[TMP27]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP26]] +; IR-ITERATIVE: 26: +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP27]] +; +; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-DPP: 10: +; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP12]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ] +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64 +; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32 +; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 +; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0 +; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1 +; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double +; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]] +; IR-DPP-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP26]] +; IR-DPP: 26: +; IR-DPP-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ] +; IR-DPP-NEXT: ret double [[TMP27]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: br label [[TMP30]] +; IR-ITERATIVE: 30: +; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-ITERATIVE-NEXT: ret double [[TMP31]] +; +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ] +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64 +; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 +; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 +; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0 +; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1 +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double +; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: br label [[TMP30]] +; IR-DPP: 30: +; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ] +; IR-DPP-NEXT: ret double [[TMP31]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{ +; IR-LABEL: @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + +define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret double [[RESULT]] +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret double %result +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 76ec1cc..99d02ff 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -358,65 +358,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -450,69 +391,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) { } define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -549,71 +427,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i } define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -648,73 +461,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) { } define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f32_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -752,80 +498,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in } define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -876,84 +548,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1007,83 +601,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace } define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1134,87 +651,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre } define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f32_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index d137f47..380ce7f 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -372,65 +372,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1) ; --------------------------------------------------------------------- define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB0_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB0_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB0_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -464,69 +405,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) { } define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v3, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB1_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v3, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB1_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB1_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -563,71 +441,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double % } define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_load_dword v4, v[0:1] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v4 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB2_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: v_mov_b32_e32 v0, v4 -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_load_dword v4, v[0:1] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v4 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB2_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: v_mov_b32_e32 v0, v4 -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB2_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -663,73 +476,6 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) { } define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_f64_e32 v4, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN1-NEXT: global_load_dword v0, v[4:5] -; GCN1-NEXT: s_mov_b64 s[4:5], 0 -; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v3, v0 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB3_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GCN2-NEXT: global_load_dword v0, v[4:5] -; GCN2-NEXT: s_mov_b64 s[4:5], 0 -; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v3, v0 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 -; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB3_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[4:5], 0 -; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v3, v4 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3 -; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB3_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN3-NEXT: v_mov_b32_e32 v0, v4 -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -768,80 +514,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double % } define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s4 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB4_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s4 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB4_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB4_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -896,84 +568,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr } define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v1, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v2, s34 -; GCN1-NEXT: v_mov_b32_e32 v0, s6 -; GCN1-NEXT: v_mov_b32_e32 v3, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB5_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v1, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v2, s34 -; GCN2-NEXT: v_mov_b32_e32 v0, s6 -; GCN2-NEXT: v_mov_b32_e32 v3, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB5_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_noret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v2, s4 -; GCN3-NEXT: v_mov_b32_e32 v0, s6 -; GCN3-NEXT: v_mov_b32_e32 v3, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: v_mov_b32_e32 v1, v0 -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB5_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1029,83 +623,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace } define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[34:35], 0 -; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s4 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s5 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_cbranch_execnz .LBB6_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[34:35], 0 -; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s4 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s5 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_cbranch_execnz .LBB6_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB6_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1160,87 +677,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr } define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) { -; GCN1-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN1: ; %bb.0: -; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s34, s4, 16 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: global_load_dword v0, v[0:1] -; GCN1-NEXT: s_mov_b64 s[36:37], 0 -; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s6 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_mov_b32_e32 v2, v0 -; GCN1-NEXT: v_mov_b32_e32 v4, s35 -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_cbranch_execnz .LBB7_1 -; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; -; GCN2-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN2: ; %bb.0: -; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s34, s4, 16 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: global_load_dword v0, v[0:1] -; GCN2-NEXT: s_mov_b64 s[36:37], 0 -; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s6 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v2, v0 -; GCN2-NEXT: v_mov_b32_e32 v4, s35 -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc -; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] -; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_cbranch_execnz .LBB7_1 -; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; -; GCN3-LABEL: global_atomic_xchg_f64_ret_offset_scalar: -; GCN3: ; %bb.0: -; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16 -; GCN3-NEXT: s_mov_b64 s[34:35], 0 -; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start -; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN3-NEXT: v_mov_b32_e32 v3, s4 -; GCN3-NEXT: v_mov_b32_e32 v1, s6 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v2, v0 -; GCN3-NEXT: v_mov_b32_e32 v4, s5 -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc -; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 -; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] -; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_cbranch_execnz .LBB7_1 -; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] -; GCN3-NEXT: s_setpc_b64 s[30:31] ; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll index fab24e1..86e3d93 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { ; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value( diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll index f87932b..b9234f4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll @@ -1,55 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s + +; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations. +; Optimization remains same for Iterative and DPP strategies when value in uniform. These different scan/reduction +; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes. define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP16]] -; IR-ITERATIVE: 16: -; IR-ITERATIVE-NEXT: br label [[TMP17]] -; IR-ITERATIVE: 17: -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) -; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float -; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] -; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] -; IR-DPP: 14: -; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP16]] -; IR-DPP: 16: -; IR-DPP-NEXT: br label [[TMP17]] -; IR-DPP: 17: -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float +; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: br label [[TMP17]] +; IR: 17: +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void @@ -325,7 +305,6 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_stri ret void } - define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 { ; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp( ; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] @@ -409,45 +388,25 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri } define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] -; IR-ITERATIVE: 2: -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP12]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: br label [[TMP13]] -; IR-ITERATIVE: 13: -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() -; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] -; IR-DPP: 2: -; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 -; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 -; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 -; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) -; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 -; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] -; IR-DPP: 10: -; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: br label [[TMP12]] -; IR-DPP: 12: -; IR-DPP-NEXT: br label [[TMP13]] -; IR-DPP: 13: -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: br label [[TMP13]] +; IR: 13: +; IR-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void @@ -797,161 +756,531 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str ret void } - define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void } define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4 ret void } define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret void } define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic ret void } define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } - define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 -; IR-DPP-NEXT: ret void +; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void ; %result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic ret void } define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ +; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { +; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { +; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) +; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double +; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]] +; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR: 14: +; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4 +; IR-NEXT: br label [[TMP16]] +; IR: 16: +; IR-NEXT: br label [[TMP17]] +; IR: 17: +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic ret void } -define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{ -; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; -; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void } -define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() +; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR: 2: +; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) +; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) +; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR: 10: +; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: br label [[TMP12]] +; IR: 12: +; IR-NEXT: br label [[TMP13]] +; IR: 13: +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{ +; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-ITERATIVE-NEXT: br label [[TMP12]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: br label [[TMP13]] +; IR-ITERATIVE: 13: ; IR-ITERATIVE-NEXT: ret void ; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]] +; IR-DPP: 10: +; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-DPP-NEXT: br label [[TMP12]] +; IR-DPP: 12: +; IR-DPP-NEXT: br label [[TMP13]] +; IR-DPP: 13: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic ret void } -define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]] +; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-ITERATIVE: 2: +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]] +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP16]] +; IR-ITERATIVE: 16: +; IR-ITERATIVE-NEXT: br label [[TMP17]] +; IR-ITERATIVE: 17: ; IR-ITERATIVE-NEXT: ret void ; -; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp( -; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4 +; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]] +; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]] +; IR-DPP: 2: +; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]] +; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]] +; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0 +; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]] +; IR-DPP: 14: +; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP16]] +; IR-DPP: 16: +; IR-DPP-NEXT: br label [[TMP17]] +; IR-DPP: 17: ; IR-DPP-NEXT: ret void ; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4 + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, double inreg %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 { +; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{ +; IR-LABEL: @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8 +; IR-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 + ret void +} + +define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 { +; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp( +; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4 +; IR-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index 96c615b..4f00d48 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare double @div.double.value() define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe: @@ -5408,6 +5409,5583 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: .LBB9_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: .LBB9_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: .LBB9_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: .LBB11_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: .LBB11_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: .LBB11_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: .LBB11_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: .LBB11_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: .LBB16_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: .LBB16_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: .LBB16_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: .LBB16_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: .LBB16_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: .LBB16_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 3cc5a4c..622be43 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare float @div.double.value() define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe: @@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: .LBB6_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: .LBB6_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: .LBB8_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: .LBB8_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: .LBB8_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: .LBB8_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: .LBB10_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: .LBB10_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: .LBB10_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: .LBB10_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 314c52a..49d415c 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare float @div.double.value() define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe: @@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2 +; GFX7LESS-NEXT: .LBB6_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-NEXT: .LBB6_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-NEXT: .LBB6_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-NEXT: .LBB6_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-NEXT: .LBB6_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-NEXT: .LBB6_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX9-DPP-NEXT: .LBB6_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1064-DPP-NEXT: .LBB6_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1032-DPP-NEXT: .LBB6_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1164-DPP-NEXT: .LBB6_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2 +; GFX1132-DPP-NEXT: .LBB6_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2 +; GFX7LESS-NEXT: .LBB8_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-NEXT: .LBB8_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-NEXT: .LBB8_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-NEXT: .LBB8_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-NEXT: .LBB8_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-NEXT: .LBB8_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX9-DPP-NEXT: .LBB8_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1064-DPP-NEXT: .LBB8_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1032-DPP-NEXT: .LBB8_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1164-DPP-NEXT: .LBB8_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2 +; GFX1132-DPP-NEXT: .LBB8_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35] +; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35] +; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2 +; GFX7LESS-NEXT: .LBB10_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-NEXT: .LBB10_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-NEXT: .LBB10_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-NEXT: .LBB10_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-NEXT: .LBB10_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-NEXT: .LBB10_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX9-DPP-NEXT: .LBB10_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1064-DPP-NEXT: .LBB10_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1032-DPP-NEXT: .LBB10_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1164-DPP-NEXT: .LBB10_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2 +; GFX1132-DPP-NEXT: .LBB10_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43] +; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43] +; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } !llvm.module.flags = !{!0} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index bc9125e..7a7ddbe 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -13,6 +13,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s declare float @div.float.value() +declare double @div.double.value() define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { ; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe: @@ -5616,6 +5617,5581 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop ret void } +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2 +; GFX7LESS-NEXT: .LBB9_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-NEXT: .LBB9_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-NEXT: .LBB9_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-NEXT: .LBB9_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-NEXT: .LBB9_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-NEXT: .LBB9_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX9-DPP-NEXT: .LBB9_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1064-DPP-NEXT: .LBB9_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1032-DPP-NEXT: .LBB9_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1164-DPP-NEXT: .LBB9_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2 +; GFX1132-DPP-NEXT: .LBB9_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2 +; GFX7LESS-NEXT: .LBB11_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-NEXT: .LBB11_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-NEXT: .LBB11_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-NEXT: .LBB11_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-NEXT: .LBB11_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-NEXT: .LBB11_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX9-DPP-NEXT: .LBB11_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1064-DPP-NEXT: .LBB11_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1032-DPP-NEXT: .LBB11_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1164-DPP-NEXT: .LBB11_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2 +; GFX1132-DPP-NEXT: .LBB11_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic + ret void +} +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{ +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s12, s12, s3 +; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0 +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3] +; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2 +; GFX7LESS-NEXT: .LBB13_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-NEXT: .LBB13_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-NEXT: .LBB13_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-NEXT: .LBB13_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-NEXT: scratch_store_b32 off, v1, off +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-NEXT: .LBB13_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-NEXT: scratch_store_b32 off, v1, off +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-NEXT: .LBB13_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s10, -1 +; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX9-DPP-NEXT: .LBB13_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1064-DPP-NEXT: .LBB13_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s10, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3 +; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5 +; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1032-DPP-NEXT: .LBB13_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1164-DPP-NEXT: .LBB13_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2 +; GFX1132-DPP-NEXT: .LBB13_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.double.value() + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b32 s32, 0 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s9 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s14, s8 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s38, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s2, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[2:3] +; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0 +; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5 +; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2 +; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5] +; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s38, -1 +; GFX9-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-NEXT: s_add_u32 s36, s36, s9 +; GFX9-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-NEXT: s_mov_b32 s14, s8 +; GFX9-NEXT: s_add_u32 s8, s2, 44 +; GFX9-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-NEXT: s_getpc_b64 s[2:3] +; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s38, -1 +; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-NEXT: s_mov_b32 s14, s8 +; GFX1064-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-NEXT: s_getpc_b64 s[4:5] +; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s6 +; GFX1064-NEXT: s_mov_b32 s13, s7 +; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-NEXT: s_mov_b32 s32, 0 +; GFX1064-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s38, -1 +; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-NEXT: s_mov_b32 s14, s8 +; GFX1032-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-NEXT: s_getpc_b64 s[4:5] +; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s6 +; GFX1032-NEXT: s_mov_b32 s13, s7 +; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-NEXT: s_mov_b32 s32, 0 +; GFX1032-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-NEXT: s_mov_b32 s0, 0 +; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b32 s14, s8 +; GFX1164-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-NEXT: s_getpc_b64 s[4:5] +; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s32, 0 +; GFX1164-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-NEXT: s_getpc_b64 s[4:5] +; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-NEXT: s_mov_b32 s0, 0 +; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s38, -1 +; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-DPP-NEXT: s_mov_b32 s14, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX9-DPP-NEXT: s_getpc_b64 s[2:3] +; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b32 s12, s6 +; GFX9-DPP-NEXT: s_mov_b32 s13, s7 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-DPP-NEXT: s_mov_b32 s32, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s38, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9 +; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35] +; GFX1032-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b32 s14, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0 +; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2 +; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5] +; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35] +; GFX1132-DPP-NEXT: s_mov_b32 s0, 0 +; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1] +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic + ret void +} +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s42, -1 +; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s40, s40, s3 +; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s2 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s34, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b32 s12, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2 +; GFX7LESS-NEXT: .LBB16_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s42, -1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], exec +; GFX9-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-NEXT: s_add_u32 s40, s40, s3 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-NEXT: s_mov_b32 s33, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-NEXT: s_add_u32 s8, s34, 44 +; GFX9-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-NEXT: s_mov_b32 s12, s33 +; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s42, -1 +; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-NEXT: s_mov_b32 s33, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-NEXT: s_mov_b32 s12, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-NEXT: .LBB16_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s33, s2 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s42, -1 +; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-NEXT: s_mov_b32 s38, 0 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-NEXT: s_mov_b32 s12, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-NEXT: .LBB16_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-NEXT: s_mov_b32 s33, s2 +; GFX1164-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b32 s12, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-NEXT: .LBB16_3: +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-NEXT: s_mov_b32 s38, 0 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b32 s12, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-NEXT: .LBB16_3: +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s42, -1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000 +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 +; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0 +; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX9-DPP-NEXT: s_mov_b32 s33, s2 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000 +; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1] +; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: s_mov_b32 s12, s33 +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX9-DPP-NEXT: .LBB16_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3] +; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1064-DPP-NEXT: .LBB16_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s42, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3 +; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1032-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24 +; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0 +; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43] +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1032-DPP-NEXT: .LBB16_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1164-DPP-NEXT: s_mov_b32 s33, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1164-DPP-NEXT: .LBB16_3: +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1] +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20 +; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16 +; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16 +; GFX1132-DPP-NEXT: s_mov_b32 s38, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1] +; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1] +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s12, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2 +; GFX1132-DPP-NEXT: .LBB16_3: +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 { +; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_movk_i32 s32, 0x800 +; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX7LESS-NEXT: s_mov_b32 s50, -1 +; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000 +; GFX7LESS-NEXT: s_add_u32 s48, s48, s9 +; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0 +; GFX7LESS-NEXT: s_mov_b32 s33, s8 +; GFX7LESS-NEXT: s_mov_b32 s40, s7 +; GFX7LESS-NEXT: s_mov_b32 s41, s6 +; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s46, -1 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1 +; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0 +; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0 +; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41] +; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: s_add_u32 s8, s36, 44 +; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12 +; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8 +; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0 +; GFX7LESS-NEXT: s_getpc_b64 s[0:1] +; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX7LESS-NEXT: s_waitcnt expcnt(2) +; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8 +; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX7LESS-NEXT: s_mov_b32 s12, s41 +; GFX7LESS-NEXT: s_mov_b32 s13, s40 +; GFX7LESS-NEXT: s_mov_b32 s14, s33 +; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42 +; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0 +; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0 +; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43] +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s50, -1 +; GFX9-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-NEXT: s_add_u32 s48, s48, s9 +; GFX9-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-NEXT: s_mov_b32 s33, s8 +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-NEXT: s_mov_b32 s40, s7 +; GFX9-NEXT: s_mov_b32 s41, s6 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: s_movk_i32 s32, 0x800 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-NEXT: s_add_u32 s8, s36, 44 +; GFX9-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-NEXT: s_mov_b32 s12, s41 +; GFX9-NEXT: s_mov_b32 s13, s40 +; GFX9-NEXT: s_mov_b32 s14, s33 +; GFX9-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-NEXT: s_mov_b32 s50, -1 +; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-NEXT: s_mov_b32 s33, s8 +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-NEXT: s_mov_b32 s40, s7 +; GFX1064-NEXT: s_mov_b32 s41, s6 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-NEXT: s_getpc_b64 s[0:1] +; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-NEXT: s_mov_b32 s12, s41 +; GFX1064-NEXT: s_mov_b32 s13, s40 +; GFX1064-NEXT: s_mov_b32 s14, s33 +; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-NEXT: s_mov_b32 s50, -1 +; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-NEXT: s_mov_b32 s33, s8 +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-NEXT: s_mov_b32 s40, s7 +; GFX1032-NEXT: s_mov_b32 s41, s6 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-NEXT: s_mov_b32 s44, 0 +; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-NEXT: s_getpc_b64 s[0:1] +; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-NEXT: s_mov_b32 s12, s41 +; GFX1032-NEXT: s_mov_b32 s13, s40 +; GFX1032-NEXT: s_mov_b32 s14, s33 +; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-NEXT: s_mov_b32 s33, s8 +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s6 +; GFX1164-NEXT: s_mov_b32 s13, s7 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_mov_b32 s32, 32 +; GFX1164-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-NEXT: s_mov_b32 s40, s7 +; GFX1164-NEXT: s_mov_b32 s41, s6 +; GFX1164-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-NEXT: .p2align 6 +; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-NEXT: s_getpc_b64 s[0:1] +; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-NEXT: s_mov_b32 s12, s41 +; GFX1164-NEXT: s_mov_b32 s13, s40 +; GFX1164-NEXT: s_mov_b32 s14, s33 +; GFX1164-NEXT: s_clause 0x1 +; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-NEXT: s_mov_b32 s40, s14 +; GFX1132-NEXT: s_mov_b32 s41, s13 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s13 +; GFX1132-NEXT: s_mov_b32 s13, s14 +; GFX1132-NEXT: s_mov_b32 s14, s15 +; GFX1132-NEXT: s_mov_b32 s32, 32 +; GFX1132-NEXT: s_mov_b32 s33, s15 +; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-NEXT: s_mov_b32 s44, 0 +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-NEXT: .p2align 6 +; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-NEXT: s_getpc_b64 s[0:1] +; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-NEXT: s_mov_b32 s12, s41 +; GFX1132-NEXT: s_mov_b32 s13, s40 +; GFX1132-NEXT: s_mov_b32 s14, s33 +; GFX1132-NEXT: s_clause 0x1 +; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX9-DPP-NEXT: s_mov_b32 s50, -1 +; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000 +; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3] +; GFX9-DPP-NEXT: s_mov_b32 s33, s8 +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_mov_b32 s40, s7 +; GFX9-DPP-NEXT: s_mov_b32 s41, s6 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5] +; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44 +; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0 +; GFX9-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35] +; GFX9-DPP-NEXT: s_mov_b32 s12, s41 +; GFX9-DPP-NEXT: s_mov_b32 s13, s40 +; GFX9-DPP-NEXT: s_mov_b32 s14, s33 +; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1064-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000 +; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1064-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1064-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1064-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1064-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1064-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1064-DPP-NEXT: s_clause 0x1 +; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 +; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 +; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9 +; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1032-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43] +; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1032-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49] +; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1032-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1032-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1032-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51] +; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 +; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX1032-DPP-NEXT: s_clause 0x1 +; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 +; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1164-DPP-NEXT: s_mov_b32 s33, s8 +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s6 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0 +; GFX1164-DPP-NEXT: s_mov_b32 s40, s7 +; GFX1164-DPP-NEXT: s_mov_b32 s41, s6 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1 +; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0 +; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0 +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1164-DPP-NEXT: .p2align 6 +; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42 +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1164-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1164-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1164-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1164-DPP-NEXT: s_clause 0x1 +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45] +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3] +; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24 +; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5] +; GFX1132-DPP-NEXT: s_mov_b32 s40, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s41, s13 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s13 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s14 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s15 +; GFX1132-DPP-NEXT: s_mov_b32 s32, 32 +; GFX1132-DPP-NEXT: s_mov_b32 s33, s15 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43] +; GFX1132-DPP-NEXT: s_mov_b32 s44, 0 +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1 +; GFX1132-DPP-NEXT: .p2align 6 +; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41] +; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44 +; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0 +; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1] +; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 +; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8 +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0 +; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39] +; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37] +; GFX1132-DPP-NEXT: s_mov_b32 s12, s41 +; GFX1132-DPP-NEXT: s_mov_b32 s13, s40 +; GFX1132-DPP-NEXT: s_mov_b32 s14, s33 +; GFX1132-DPP-NEXT: s_clause 0x1 +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off +; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42 +; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off +; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44 +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1 +; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2 +; GFX1132-DPP-NEXT: s_endpgm + %divValue = call double @div.float.value() strictfp + %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4 + ret void +} + attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } attributes #2 = { strictfp} diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir index bdd89a9..dde84af 100644 --- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir @@ -13,6 +13,7 @@ name: greedy_fail_alloc_sgpr1024_spill tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: explicitKernArgSize: 16 diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll index a5792bf..4c21f87 100644 --- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll @@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo ;. ; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V4: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V4: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V5: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V5: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; V6: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } -; V6: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } +; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" } ;. ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll index e015095a..ab160ff 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll @@ -92,7 +92,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX11-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -122,7 +121,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b ; DAGISEL-GFX10-NEXT: $vgpr5 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr6 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr7 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -234,7 +232,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -272,7 +269,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -404,7 +400,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -454,7 +449,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in ; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr12 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -506,7 +500,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -524,7 +517,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b) ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -576,7 +568,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -594,7 +585,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -646,7 +636,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -664,7 +653,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat % ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -716,7 +704,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -734,7 +721,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) { ; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -870,7 +856,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX11-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -916,7 +901,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16 ; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr14 = COPY [[COPY1]] ; DAGISEL-GFX10-NEXT: $vgpr15 = COPY [[COPY]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 @@ -2480,7 +2464,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX11-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX11-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX11-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX11-NEXT: S_ENDPGM 0 @@ -2827,7 +2810,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128 ; DAGISEL-GFX10-NEXT: $vgpr29 = COPY [[COPY134]] ; DAGISEL-GFX10-NEXT: $vgpr30 = COPY [[COPY133]] ; DAGISEL-GFX10-NEXT: $vgpr31 = COPY [[COPY132]] - ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31 ; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; DAGISEL-GFX10-NEXT: S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll new file mode 100644 index 0000000..bfeb214 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -0,0 +1,1618 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s + +define float @sitofp_i128_to_f32(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f32: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB0_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 +; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 +; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_u32_e32 v6, 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v7, v1 +; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB0_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB0_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_add_u32_e32 v14, 26, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] +; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v11, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: .LBB0_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB0_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB0_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 +; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 +; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1 +; SDAG-NEXT: .LBB0_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f32: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB0_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB0_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB0_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB0_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB0_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB0_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1 +; GISEL-NEXT: .LBB0_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to float + ret float %cvt +} + +define float @uitofp_i128_to_f32(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f32: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB1_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB1_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB1_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v7, v9 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v13, 26, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] +; SDAG-NEXT: v_subrev_u32_e32 v6, 38, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-NEXT: v_mov_b32_e32 v1, v8 +; SDAG-NEXT: .LBB1_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB1_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB1_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 +; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 +; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0 +; SDAG-NEXT: .LBB1_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f32: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB1_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB1_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB1_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB1_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB1_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB1_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0 +; GISEL-NEXT: .LBB1_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to float + ret float %cvt +} + +define double @sitofp_i128_to_f64(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f64: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_or_b32_e32 v1, v5, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v4, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB2_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4 +; SDAG-NEXT: v_xor_b32_e32 v5, v0, v5 +; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v0 +; SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v0, vcc +; SDAG-NEXT: v_xor_b32_e32 v1, v0, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v0, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v0, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v0, v6 +; SDAG-NEXT: v_add_u32_e32 v0, 32, v0 +; SDAG-NEXT: v_ffbh_u32_e32 v1, v7 +; SDAG-NEXT: v_min_u32_e32 v0, v0, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v1, v4 +; SDAG-NEXT: v_add_u32_e32 v1, 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v5 +; SDAG-NEXT: v_min_u32_e32 v1, v1, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v1, 64, v1 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc +; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: ; implicit-def: $vgpr10 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB2_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB2_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v9 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v12, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] +; SDAG-NEXT: v_sub_u32_e32 v13, 9, v9 +; SDAG-NEXT: v_or_b32_e32 v11, v1, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v0, v10 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], v13, v[6:7] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_add_u32_e32 v16, 55, v9 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v12, v[6:7] +; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[14:15], v16, v[6:7] +; SDAG-NEXT: v_add_u32_e32 v9, -9, v9 +; SDAG-NEXT: v_or_b32_e32 v15, v15, v13 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v9, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v9, v13, v15, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; SDAG-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SDAG-NEXT: v_or_b32_e32 v5, v5, v7 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v6, v10 +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v5, v1 +; SDAG-NEXT: v_mov_b32_e32 v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v7, v11 +; SDAG-NEXT: .LBB2_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB2_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v5 +; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; SDAG-NEXT: v_or_b32_e32 v6, v6, v0 +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4 +; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4 +; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc +; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v1, v7 +; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5] +; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v1, v2 +; SDAG-NEXT: v_mov_b32_e32 v2, v8 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB2_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3 +; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3 +; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10 +; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2 +; SDAG-NEXT: .LBB2_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f64: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v4, v0 +; GISEL-NEXT: v_mov_b32_e32 v5, v1 +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_or_b32_e32 v0, v4, v2 +; GISEL-NEXT: v_or_b32_e32 v1, v5, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB2_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 +; GISEL-NEXT: ; implicit-def: $vgpr10 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB2_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB2_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_or_b32_e32 v10, v4, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v5, v11 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_add_u32_e32 v14, 55, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_or_b32_e32 v16, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v17, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB2_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB2_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GISEL-NEXT: v_or_b32_e32 v11, v2, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v9 +; GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GISEL-NEXT: v_mov_b32_e32 v2, v11 +; GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] +; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB2_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1 +; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 +; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 +; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0 +; GISEL-NEXT: .LBB2_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to double + ret double %cvt +} + +define double @uitofp_i128_to_f64(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f64: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: v_mov_b32_e32 v5, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: ; implicit-def: $vgpr9 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB3_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB3_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x49, v8 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 9, v8 +; SDAG-NEXT: v_or_b32_e32 v10, v5, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v4, v9 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v15, 55, v8 +; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v11, v[2:3] +; SDAG-NEXT: v_lshrrev_b64 v[11:12], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[13:14], v15, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v8, -9, v8 +; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 +; SDAG-NEXT: v_or_b32_e32 v13, v13, v11 +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v8, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v11, v13, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_mov_b32_e32 v2, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v4, v4, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: v_mov_b32_e32 v3, v10 +; SDAG-NEXT: .LBB3_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB3_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] +; SDAG-NEXT: v_lshrrev_b32_e32 v3, 31, v1 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: v_or_b32_e32 v2, v2, v3 +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v3, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v3, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3 +; SDAG-NEXT: v_mov_b32_e32 v6, v7 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB3_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9 +; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0 +; SDAG-NEXT: .LBB3_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: v_mov_b32_e32 v1, v5 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f64: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: s_mov_b64 s[4:5], 0 +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7 +; GISEL-NEXT: ; implicit-def: $vgpr9 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB3_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB3_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v8 +; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v9, v4, v9 +; GISEL-NEXT: v_or_b32_e32 v10, v5, v10 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_add_u32_e32 v8, 55, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v1, s[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v8, -1 +; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v8 +; GISEL-NEXT: v_or_b32_e32 v16, v4, v12 +; GISEL-NEXT: v_or_b32_e32 v17, v5, v13 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v4, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v5, v3 +; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v8, v14, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: .LBB3_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB3_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 +; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mov_b32_e32 v3, v11 +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_mov_b32_e32 v8, 0 +; GISEL-NEXT: v_and_b32_e32 v9, 0x800000, v1 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1 +; GISEL-NEXT: v_or_b32_e32 v9, v5, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1 +; GISEL-NEXT: v_or_b32_e32 v9, v0, v2 +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB3_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 +; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 +; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 +; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 +; GISEL-NEXT: .LBB3_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to double + ret double %cvt +} + +define half @sitofp_i128_to_f16(i128 %x) { +; SDAG-LABEL: sitofp_i128_to_f16: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB4_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0 +; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1 +; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5 +; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2 +; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3 +; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc +; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v2, v4 +; SDAG-NEXT: v_add_u32_e32 v2, 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v5 +; SDAG-NEXT: v_min_u32_e32 v2, v2, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v0 +; SDAG-NEXT: v_add_u32_e32 v6, 32, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v7, v1 +; SDAG-NEXT: v_min_u32_e32 v6, v6, v7 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_add_u32_e32 v6, 64, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc +; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7 +; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: ; implicit-def: $vgpr8 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB4_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB4_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB4_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7 +; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] +; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7 +; SDAG-NEXT: v_or_b32_e32 v11, v9, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12 +; SDAG-NEXT: v_add_u32_e32 v14, 26, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5] +; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7 +; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v11, v13, v11 +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v8, v15, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v8 +; SDAG-NEXT: v_mov_b32_e32 v1, v9 +; SDAG-NEXT: .LBB4_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB4_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v2, v6 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB4_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3 +; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0 +; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8 +; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SDAG-NEXT: .LBB4_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sitofp_i128_to_f16: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB4_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0 +; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1 +; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr8 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB4_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB4_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB4_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 +; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB4_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB4_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v7, v8 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB4_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 +; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0 +; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4 +; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GISEL-NEXT: .LBB4_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = sitofp i128 %x to half + ret half %cvt +} + +define half @uitofp_i128_to_f16(i128 %x) { +; SDAG-LABEL: uitofp_i128_to_f16: +; SDAG: ; %bb.0: ; %itofp-entry +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_or_b32_e32 v5, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v4, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_mov_b32_e32 v4, 0 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SDAG-NEXT: s_cbranch_execz .LBB5_14 +; SDAG-NEXT: ; %bb.1: ; %itofp-if-end +; SDAG-NEXT: v_ffbh_u32_e32 v4, v2 +; SDAG-NEXT: v_add_u32_e32 v4, 32, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v3 +; SDAG-NEXT: v_min_u32_e32 v4, v4, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v5, v0 +; SDAG-NEXT: v_add_u32_e32 v5, 32, v5 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v1 +; SDAG-NEXT: v_min_u32_e32 v5, v5, v6 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_add_u32_e32 v5, 64, v5 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc +; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6 +; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6 +; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: ; implicit-def: $vgpr7 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SDAG-NEXT: ; %bb.2: ; %itofp-if-else +; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc +; SDAG-NEXT: ; implicit-def: $vgpr5 +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: ; implicit-def: $vgpr6 +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: ; %bb.3: ; %Flow3 +; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB5_13 +; SDAG-NEXT: ; %bb.4: ; %NodeBlock +; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; SDAG-NEXT: s_cbranch_execz .LBB5_8 +; SDAG-NEXT: ; %bb.5: ; %LeafBlock +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5 +; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SDAG-NEXT: s_cbranch_execz .LBB5_7 +; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default +; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6 +; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] +; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6 +; SDAG-NEXT: v_or_b32_e32 v10, v8, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v7, v9 +; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 +; SDAG-NEXT: v_add_u32_e32 v13, 26, v6 +; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11 +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc +; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1] +; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3] +; SDAG-NEXT: v_subrev_u32_e32 v6, 38, v6 +; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5] +; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5] +; SDAG-NEXT: v_or_b32_e32 v10, v12, v10 +; SDAG-NEXT: v_or_b32_e32 v9, v11, v9 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 +; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v7, v14, v0 +; SDAG-NEXT: v_mov_b32_e32 v0, v7 +; SDAG-NEXT: v_mov_b32_e32 v1, v8 +; SDAG-NEXT: .LBB5_7: ; %Flow1 +; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] +; SDAG-NEXT: .LBB5_8: ; %Flow2 +; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb +; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0 +; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0 +; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2 +; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20 +; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3 +; SDAG-NEXT: v_mov_b32_e32 v4, v5 +; SDAG-NEXT: ; %bb.12: ; %Flow +; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: .LBB5_13: ; %Flow4 +; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] +; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7 +; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0 +; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0 +; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SDAG-NEXT: .LBB5_14: ; %Flow5 +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] +; SDAG-NEXT: v_mov_b32_e32 v0, v4 +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: uitofp_i128_to_f16: +; GISEL: ; %bb.0: ; %itofp-entry +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_or_b32_e32 v4, v0, v2 +; GISEL-NEXT: v_or_b32_e32 v5, v1, v3 +; GISEL-NEXT: s_mov_b32 s4, 0 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GISEL-NEXT: s_cbranch_execz .LBB5_14 +; GISEL-NEXT: ; %bb.1: ; %itofp-if-end +; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 +; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 +; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v6, v2 +; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 +; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_add_u32_e32 v6, 32, v6 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 +; GISEL-NEXT: v_min_u32_e32 v5, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc +; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5 +; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5 +; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7 +; GISEL-NEXT: ; implicit-def: $vgpr4 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GISEL-NEXT: ; %bb.2: ; %itofp-if-else +; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: ; implicit-def: $vgpr7 +; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr2 +; GISEL-NEXT: ; %bb.3: ; %Flow3 +; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB5_13 +; GISEL-NEXT: ; %bb.4: ; %NodeBlock +; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5] +; GISEL-NEXT: s_cbranch_execz .LBB5_8 +; GISEL-NEXT: ; %bb.5: ; %LeafBlock +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7 +; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GISEL-NEXT: s_cbranch_execz .LBB5_7 +; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default +; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 +; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 +; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 +; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 +; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3 +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GISEL-NEXT: v_or_b32_e32 v3, v12, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: .LBB5_7: ; %Flow1 +; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] +; GISEL-NEXT: .LBB5_8: ; %Flow2 +; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v0, v2 +; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0 +; GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v6, v7 +; GISEL-NEXT: ; %bb.12: ; %Flow +; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] +; GISEL-NEXT: .LBB5_13: ; %Flow4 +; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] +; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0 +; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0 +; GISEL-NEXT: .LBB5_14: ; %Flow5 +; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GISEL-NEXT: s_setpc_b64 s[30:31] + %cvt = uitofp i128 %x to half + ret half %cvt +} + +; FIXME: ExpandLargeFpConvert asserts on bfloat +; define bfloat @sitofp_i128_to_bf16(i128 %x) { +; %cvt = sitofp i128 %x to bfloat +; ret bfloat %cvt +; } + +; define bfloat @uitofp_i128_to_bf16(i128 %x) { +; %cvt = uitofp i128 %x to bfloat +; ret bfloat %cvt +; } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index 1acbb09..fbf2ee1 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -60,7 +60,6 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 diff --git a/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll new file mode 100644 index 0000000..d101d8d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s + +; This looks like a partially lowered module, but the non-lowered GV isn't used by any kernels. +; In such cases, LowerModuleLDS is free to leave it in and ignore it, and we want to make sure +; LowerModuleLDS doesn't crash if it re-runs on such modules. +@notLowered = addrspace(3) global i32 poison +@lowered = addrspace(3) global i32 poison, !absolute_symbol !0 + +@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @notLowered to ptr)], section "llvm.metadata" + +define amdgpu_kernel void @kern(i32 %val0) { +; CHECK-LABEL: define amdgpu_kernel void @kern( +; CHECK-SAME: i32 [[VAL0:%.*]]) { +; CHECK-NEXT: [[VAL1:%.*]] = add i32 [[VAL0]], 4 +; CHECK-NEXT: store i32 [[VAL1]], ptr addrspace(3) @lowered, align 4 +; CHECK-NEXT: ret void +; + %val1 = add i32 %val0, 4 + store i32 %val1, ptr addrspace(3) @lowered + ret void +} + + +!0 = !{i32 0, i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll index b512a43..b1f4f2e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll @@ -8,7 +8,7 @@ define amdgpu_kernel void @kern() { %val0 = load i32, ptr addrspace(3) @var1 %val1 = add i32 %val0, 4 - store i32 %val1, ptr addrspace(3) @var1 + store i32 %val1, ptr addrspace(3) @var2 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 5007f77..0ff5dd3 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -195,13 +195,13 @@ ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU atomic optimizations ; GCN-O1-NEXT: Expand Atomic instructions -; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Cycle Info Analysis ; GCN-O1-NEXT: Uniformity Analysis ; GCN-O1-NEXT: AMDGPU IR optimizations ; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Canonicalize natural loops ; GCN-O1-NEXT: Scalar Evolution Analysis ; GCN-O1-NEXT: Loop Pass Manager @@ -470,9 +470,9 @@ ; GCN-O1-OPTS-NEXT: Uniformity Analysis ; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations ; GCN-O1-OPTS-NEXT: Expand Atomic instructions -; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Canonicalize natural loops ; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis ; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis @@ -775,9 +775,9 @@ ; GCN-O2-NEXT: Uniformity Analysis ; GCN-O2-NEXT: AMDGPU atomic optimizations ; GCN-O2-NEXT: Expand Atomic instructions -; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction ; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O2-NEXT: Scalar Evolution Analysis ; GCN-O2-NEXT: Straight line strength reduction @@ -1084,9 +1084,9 @@ ; GCN-O3-NEXT: Uniformity Analysis ; GCN-O3-NEXT: AMDGPU atomic optimizations ; GCN-O3-NEXT: Expand Atomic instructions -; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction ; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O3-NEXT: Scalar Evolution Analysis ; GCN-O3-NEXT: Straight line strength reduction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index b4415c1..f6197e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1)) -declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1)) -declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1)) -declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1)) +declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b64: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b64: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep) + %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep) store <2 x i32> %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep) + %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep) store <8 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep) - store <8 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W32: ; %bb.0: ; %entry -; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-SDAG-W32-NEXT: s_nop 0 -; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W32-NEXT: s_endpgm -; -; GFX12-GISEL-W32-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W32: ; %bb.0: ; %entry -; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 -; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 -; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] -; GFX12-GISEL-W32-NEXT: s_nop 0 -; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W32-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep) - store <8 x bfloat> %val, ptr addrspace(1) %use - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 7ad1416..a2dc366 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -1,132 +1,44 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1)) -declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1)) -declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1)) -declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b64: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b64: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +; GFX12-LABEL: global_load_tr_b64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep) + %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep) store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm +define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) { +; GFX12-LABEL: global_load_tr_b128: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep) + %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep) store <4 x i16> %val, ptr addrspace(1) %use ret void } - -define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep) - store <4 x half> %val, ptr addrspace(1) %use - ret void -} - -define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) { -; GFX12-SDAG-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-SDAG-W64: ; %bb.0: ; %entry -; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-SDAG-W64-NEXT: s_nop 0 -; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-W64-NEXT: s_endpgm -; -; GFX12-GISEL-W64-LABEL: global_load_tr_b128_bfloat: -; GFX12-GISEL-W64: ; %bb.0: ; %entry -; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 -; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] -; GFX12-GISEL-W64-NEXT: s_nop 0 -; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-W64-NEXT: s_endpgm -entry: - %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 - %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep) - store <4 x bfloat> %val, ptr addrspace(1) %use - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir index 091b29c..e93595b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir @@ -4,6 +4,8 @@ --- | define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void } + attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } + !0 = distinct !{!0} !1 = !{!1, !0} ... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 1348315..7b1f55e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -22,18 +22,36 @@ main_body: define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_both: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x00,0x00,0x60,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_both: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x05] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v9, v8 :: v_dual_mov_b32 v10, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0a,0x09] +; GFX12-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v12, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0c,0x0b] +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) @@ -63,18 +81,37 @@ main_body: define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05] +; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x23,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a] +; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c] +; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -155,18 +192,31 @@ main_body: define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03] +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x03] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 ; encoding: [0x06,0x01,0x10,0xca,0x06,0x01,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -196,18 +246,31 @@ main_body: define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9d,0x01,0x62,0xf0,0x06,0x00,0x20,0x00,0x05,0x04,0x03,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x07,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 ; encoding: [0x07,0x01,0x10,0xca,0x07,0x01,0x08,0x08] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll index 429528e..e3dd036 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -147,6 +147,34 @@ main_body: ret half %res } +define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 { +; GCN-LABEL: v_interp_rtz_f16: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_mov_b32 s3, exec_lo +; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo +; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15 +; GCN-NEXT: s_mov_b32 exec_lo, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0 +; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7 +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7 +; GCN-NEXT: v_add_f16_e32 v0, v3, v0 +; GCN-NEXT: ; return to shader part epilog +main_body: + %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0) + %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0) + %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0) + %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1) + %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1) + %res = fadd half %l_p1, %h_p1 + ret half %res +} + define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 { ; GCN-LABEL: v_interp_f16_imm_params: ; GCN: ; %bb.0: ; %main_body @@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0 declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0 declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0 +declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0 +declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll new file mode 100644 index 0000000..fdcb177 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s + +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(<8 x half>, <16 x half>, <8 x half>, i16) + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v48, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 +; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; GCN-NEXT: ds_load_b128 v[8:11], v0 +; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512 +; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536 +; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072 +; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120 +; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280 +; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x6 +; GCN-NEXT: v_mov_b32_e32 v31, v11 +; GCN-NEXT: s_wait_dscnt 0x5 +; GCN-NEXT: v_mov_b32_e32 v35, v15 +; GCN-NEXT: s_wait_dscnt 0x4 +; GCN-NEXT: v_mov_b32_e32 v39, v19 +; GCN-NEXT: s_wait_dscnt 0x3 +; GCN-NEXT: v_mov_b32_e32 v43, v23 +; GCN-NEXT: s_wait_dscnt 0x2 +; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 +; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 +; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 +; GCN-NEXT: v_mov_b32_e32 v32, v12 +; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 +; GCN-NEXT: v_mov_b32_e32 v36, v16 +; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 +; GCN-NEXT: v_mov_b32_e32 v40, v20 +; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 +; GCN-NEXT: v_mov_b32_e32 v44, v24 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 +; GCN-NEXT: ds_store_b128 v49, v[28:31] +; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512 +; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024 +; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536 +; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) +; GCN-NEXT: s_endpgm +; +; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: +; EXACTCUTOFF: ; %bb.0: ; %entry +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 +; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 +; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072 +; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120 +; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280 +; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48 +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48 +; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31] +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536 +; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0) +; EXACTCUTOFF-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx + %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32 + %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64 + %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96 + %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128 + %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr + %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192 + %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr + %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0) + %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0) + %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0) + %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0) + %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0) + %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx + store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 + store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 + store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 + store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 + store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr + ; 7 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0) + ; 5 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0) + ; 5 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0) + ret void +} + +define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { +; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v18, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; GCN-NEXT: ds_load_b128 v[1:4], v17 +; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x2 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; GCN-NEXT: s_wait_dscnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; GCN-NEXT: s_endpgm +; +; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: +; EXACTCUTOFF: ; %bb.0: ; %entry +; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 +; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 +; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536 +; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9 +; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) +; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048 +; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) +; EXACTCUTOFF-NEXT: s_endpgm +entry: + %idx = call i32 @llvm.amdgcn.workitem.id.x() + %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx + %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr + %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64 + %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr + %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96 + %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr + %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128 + %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr + %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160 + %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr + %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192 + %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr + %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0) + %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0) + %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0) + %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0) + %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0) + %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx + store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr + %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32 + store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr + %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64 + store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr + %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96 + store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr + %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128 + store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr + ; 3 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ; 1 DS read + call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) + ; 1 SWMMAC + call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + ; 1 DS write + call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll index 00be32b..ba3d306 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s ;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s @@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs_large: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_16bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0 @@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen @@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen @@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both_reversed: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v2, v0 @@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x_i32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_xy: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -644,7 +782,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX6-NEXT: v_mov_b32_e32 v7, 2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s0, s2 @@ -658,7 +801,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 -; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX8PLUS-NEXT: v_mov_b32_e32 v7, 2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 +; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8PLUS-NEXT: v_mov_b32_e32 v0, v6 @@ -667,22 +815,40 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v2, 2 +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4i32_tfe: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 +; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: ; return to shader part epilog - %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 2, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <4 x i32>, i32 } %load, 1 @@ -694,6 +860,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -708,6 +878,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -718,15 +892,32 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 ; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -744,6 +935,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -759,6 +953,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -769,15 +966,31 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v3i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -795,6 +1008,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -810,6 +1026,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -820,15 +1039,31 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v3f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -846,6 +1081,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -860,6 +1098,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -870,15 +1110,29 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -896,6 +1150,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -910,6 +1167,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -920,15 +1179,29 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -946,6 +1219,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -960,6 +1234,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -970,15 +1245,28 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -996,6 +1284,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1010,6 +1299,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -1020,15 +1310,28 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll index b0bd4e4..c5202b8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load: @@ -31,6 +32,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrsp ; GFX11-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -62,6 +73,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret <4 x float> %data @@ -126,6 +144,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) ; GFX11-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog main_body: %d.0 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0) %d.1 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0) @@ -156,6 +193,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0) ret <4 x float> %data @@ -188,6 +232,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0) ret <4 x float> %data @@ -220,6 +273,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0) ret <4 x float> %data @@ -252,6 +314,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0) ret <4 x float> %data @@ -284,6 +355,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0) ret <4 x float> %data @@ -307,6 +387,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data @@ -339,6 +425,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -371,6 +466,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) @@ -395,6 +499,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data @@ -421,6 +531,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -447,6 +564,13 @@ define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %data @@ -473,6 +597,13 @@ define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call i32 @llvm.amdgcn.struct.ptr.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i32 %data to float @@ -500,6 +631,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret <2 x float> %data @@ -509,6 +647,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -523,6 +665,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -533,11 +679,25 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out @@ -550,6 +710,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -564,6 +728,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -574,11 +742,25 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %load, 0 store <4 x float> %data, ptr addrspace(1) %out @@ -591,6 +773,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -606,6 +791,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -616,11 +804,24 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %load, 0 store <3 x i32> %data, ptr addrspace(1) %out @@ -633,6 +834,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -648,6 +852,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -658,11 +865,24 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %load, 0 store <3 x float> %data, ptr addrspace(1) %out @@ -675,6 +895,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -689,6 +912,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -699,11 +924,23 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %load, 0 store <2 x i32> %data, ptr addrspace(1) %out @@ -716,6 +953,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -730,6 +970,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -740,11 +982,23 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %load, 0 store <2 x float> %data, ptr addrspace(1) %out @@ -757,6 +1011,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -771,6 +1026,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -781,11 +1037,22 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %load, 0 store i32 %data, ptr addrspace(1) %out @@ -798,6 +1065,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -812,6 +1080,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -822,11 +1091,22 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { float, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { float, i32 } %load, 0 store float %data, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index ab7ab4d..d056a97 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_max_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_max_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_max_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_max_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_max_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_max_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_max_f32_e32 v2, v3, v4 +; SI-NEXT: v_max_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_max_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_max_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_max_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_max_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_max_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_max_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index b7370ce..f934a2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee( ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s1, s2, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 -; SI-NEXT: s_lshr_b32 s0, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_lshr_b32 s3, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s0 ; SI-NEXT: v_min_f32_e32 v0, v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b( ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dword s2, s[2:3], 0x0 -; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 +; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_min_f32_e32 v0, 4.0, v0 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_min_f32_e32 v1, 4.0, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16( ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 -; SI-NEXT: s_lshr_b32 s2, s2, 16 -; SI-NEXT: s_lshr_b32 s3, s0, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s3 +; SI-NEXT: s_lshr_b32 s3, s2, 16 +; SI-NEXT: s_lshr_b32 s8, s0, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_min_f32_e32 v2, v3, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v3 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_min_f32_e32 v0, v0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_min_f32_e32 v1, v1, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_min_f32_e32 v2, v3, v4 +; SI-NEXT: v_min_f32_e32 v0, v0, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16( ; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 ; SI-NEXT: s_lshr_b32 s6, s7, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: s_lshr_b32 s6, s5, 16 +; SI-NEXT: s_lshr_b32 s4, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 -; SI-NEXT: s_lshr_b32 s4, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, s4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, s5 ; SI-NEXT: v_min_f32_e32 v3, v3, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_min_f32_e32 v1, v1, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, v2, v5 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_min_f32_e32 v2, v2, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_min_f32_e32 v1, v1, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v0, v0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: s_lshr_b32 s5, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1 +; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 +; SI-NEXT: v_min_f32_e32 v2, 4.0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_min_f32_e32 v3, 2.0, v3 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll new file mode 100644 index 0000000..f1d9463 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll @@ -0,0 +1,47 @@ + +; Default O0 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O0 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O1 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O1 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O2 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O2 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Default O3 +; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; Unified O3 +; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc +; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s + +; First print will be from the New PM during the full LTO pipeline. +; Second print will be from the legacy PM during the CG pipeline. + +; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module] +; CHECK: ModulePass Manager +; CHECK: Lower uses of LDS variables from non-kernel functions + +@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4 + +define amdgpu_kernel void @test() { +entry: + store i32 1, ptr addrspace(3) @lds + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index fb3e79b..5b7f0e7 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX1100: ; %bb.0: ; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0 ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0] ; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX900: ; %bb.0: ; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-GFX906: ; %bb.0: ; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp ; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 -; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp -; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp ; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2 +; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100: ; %bb.0: +; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX900-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-NEXT: v_mov_b32_e32 v1, v2 -; GFX900-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX906-NEXT: v_mov_b32_e32 v0, v6 -; GFX906-NEXT: v_mov_b32_e32 v1, v2 -; GFX906-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX906: ; %bb.0: +; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0 +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 +; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9 ; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8 -; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp -; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 +; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2 +; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] @@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX906: ; %bb.0: +; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6 +; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2 +; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir new file mode 100644 index 0000000..d7f5d1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir @@ -0,0 +1,1154 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GFX12 %s + +--- +name: buffer_load_dword_dwordx3 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx3_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx3_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + + +name: buffer_load_dword_dword +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_32 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_32 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1 + ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2 + ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +# +# buffer_store_dword +# + +name: buffer_store_dword_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_xyz + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx3_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx3_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx2_dwordx2 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %15:vreg_64, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_dwordx2 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %15:vreg_64, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dwordx2_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-LABEL: name: buffer_store_dword_dword + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX12-LABEL: name: buffer_store_dword_32 + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX12-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %5:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %8:vgpr_32, %13:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %9:vgpr_32, %13:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %10:vgpr_32, %13:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %11:vgpr_32, %13:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %12:vgpr_32, %13:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_1 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_1 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_merge_across_swizzle +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzle + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merge_across_swizzled_store +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_not_merge_across_swizzled_store + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_merge_across_swizzled_store +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzled_store + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1 + ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_idxen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_bothen +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact +body: | + bb.0.entry: + + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_bothen_exact +body: | + bb.0.entry: + + ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0_sub1 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub2 + ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0 + ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:vreg_64 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vreg_64 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:vgpr_32 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc +body: | + bb.0.entry: + ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc + ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vgpr_32 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir new file mode 100644 index 0000000..1c6d429 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir @@ -0,0 +1,1130 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s + +--- +name: buffer_load_dword_dwordx3 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx3_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx3_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + + +name: buffer_load_dword_dword +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_32 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_32 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1 + ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2 + ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0 + ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %11:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %12:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %13:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %14:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +# +# buffer_store_dword +# + +name: buffer_store_dword_xyz +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_xyz + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx3_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx3_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2 + BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx2_dwordx2 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dwordx2 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_dwordx2 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) +... +--- + +name: buffer_store_dwordx2_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dwordx2_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1 + BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_dword +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-LABEL: name: buffer_store_dword_dword + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_store_dword_32 +body: | + bb.0.entry: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GCN-LABEL: name: buffer_store_dword_32 + ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1 + ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4) + ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2 + ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4) + ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GCN-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2 + ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4) + %12:vgpr_32 = COPY $vgpr8 + %11:vgpr_32 = COPY $vgpr7 + %10:vgpr_32 = COPY $vgpr6 + %9:vgpr_32 = COPY $vgpr5 + %8:vgpr_32 = COPY $vgpr4 + %7:vgpr_32 = COPY $vgpr3 + %6:vgpr_32 = COPY $vgpr2 + %5:vgpr_32 = COPY $vgpr1 + %4:vgpr_32 = COPY $vgpr0 + %3:sgpr_32 = COPY $sgpr3 + %2:sgpr_32 = COPY $sgpr2 + %1:sgpr_32 = COPY $sgpr1 + %0:sgpr_32 = COPY $sgpr0 + %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_not_merged_swizzled_1 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_1 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_merge_across_swizzle +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzle + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_merge_across_swizzled_store +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzled_store + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %5:sgpr_128, 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4) + %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_idxen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_bothen +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_idxen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx2_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub1_sub2 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact +body: | + bb.0.entry: + + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dwordx3_bothen_exact +body: | + bb.0.entry: + + ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub1_sub2_sub3 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_bothen_exact +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0_sub1 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub2 + ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0 + ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0 + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4) + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0 + ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1 + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vreg_64 = COPY $vgpr0 + %5:vreg_64 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vreg_64 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:vgpr_32 = COPY $vgpr0 + %5:vgpr_32 = COPY $vgpr1 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... +--- + +name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc +body: | + bb.0.entry: + ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc + ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3 + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %0:sgpr_32 = COPY $sgpr0 + %1:sgpr_32 = COPY $sgpr1 + %2:sgpr_32 = COPY $sgpr2 + %3:sgpr_32 = COPY $sgpr3 + %4:sgpr_32 = COPY $sgpr4 + %5:vgpr_32 = COPY $vgpr0 + %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3 + %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3 + %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) +... diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir index c86b5ad..9766b42 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir @@ -7,9 +7,37 @@ # GFX9 tests # +--- name: gfx9_tbuffer_load_x_xyz body: | bb.0.entry: + ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4) + ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0 + ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3 + ; + ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) + ; + ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz + ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4) + ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4) %0:sgpr_32 = COPY $sgpr0 %1:sgpr_32 = COPY $sgpr1 %2:sgpr_32 = COPY $sgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index cbdc7bb..69971bc 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -43,7 +43,6 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; clobber csr v40 @@ -55,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -88,7 +87,6 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -148,7 +146,6 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm bb: @@ -173,7 +170,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s24, s33 +; CHECK-NEXT: s_mov_b32 s18, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -188,7 +185,6 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 @@ -196,7 +192,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s24 +; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -208,7 +204,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s25, s33 +; CHECK-NEXT: s_mov_b32 s19, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -223,7 +219,6 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 @@ -231,7 +226,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s25 +; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -263,7 +258,6 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir index 3de258b..bf2cf6a 100644 --- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir +++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir @@ -5,6 +5,14 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=75 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD75 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD100 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-DEFAULT %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD50 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD100 %s + +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-DEFAULT %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD50 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD100 %s + --- name: mfma_padding_2_pass body: | @@ -31,6 +39,35 @@ body: | ; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 1 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 0 + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 1 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: S_NOP 1 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 1 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ... @@ -64,6 +101,40 @@ body: | ; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 0 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 0 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: S_NOP 0 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 0 + ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 0 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec @@ -100,6 +171,41 @@ body: | ; gfx908-PAD100-NEXT: DBG_VALUE ; gfx908-PAD100-NEXT: S_NOP 1 ; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: DBG_VALUE + ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: DBG_VALUE + ; gfx90a-PAD50-NEXT: S_NOP 0 + ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass_dbg + ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: DBG_VALUE + ; gfx90a-PAD100-NEXT: S_NOP 1 + ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: DBG_VALUE + ; gfx940-DEFAULT-NEXT: S_NOP 1 + ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: DBG_VALUE + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_dbg + ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: DBG_VALUE + ; gfx940-PAD100-NEXT: S_NOP 1 + ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec DBG_VALUE $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec @@ -132,6 +238,34 @@ body: | ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 3 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 3 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... @@ -172,6 +306,46 @@ body: | ; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: S_NOP 5 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 1 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 1 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 5 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -207,6 +381,36 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 7 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 7 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... @@ -258,6 +462,60 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 3 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 3 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 3 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 3 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: S_NOP 3 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -369,6 +627,126 @@ body: | ; gfx908-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $vgpr2 = V_MOV_B32_e32 1, implicit $exec $vgpr3 = V_MOV_B32_e32 1, implicit $exec @@ -414,6 +792,30 @@ body: | ; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 ; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_occ_1 + ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec ... @@ -506,6 +908,108 @@ body: | ; gfx908-PAD100-NEXT: S_NOP 7 ; gfx908-PAD100-NEXT: S_NOP 5 ; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-DEFAULT: bb.0: + ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: bb.1: + ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: {{ $}} + ; gfx90a-DEFAULT-NEXT: bb.2: + ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-PAD50: bb.0: + ; gfx90a-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: bb.1: + ; gfx90a-PAD50-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: {{ $}} + ; gfx90a-PAD50-NEXT: bb.2: + ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD50-NEXT: S_NOP 5 + ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx90a-PAD100: bb.0: + ; gfx90a-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx90a-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: bb.1: + ; gfx90a-PAD100-NEXT: successors: %bb.2(0x80000000) + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: {{ $}} + ; gfx90a-PAD100-NEXT: bb.2: + ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx90a-PAD100-NEXT: S_NOP 7 + ; gfx90a-PAD100-NEXT: S_NOP 5 + ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-DEFAULT: bb.0: + ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: bb.1: + ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x80000000) + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: {{ $}} + ; gfx940-DEFAULT-NEXT: bb.2: + ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-PAD50: bb.0: + ; gfx940-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: bb.1: + ; gfx940-PAD50-NEXT: successors: %bb.2(0x80000000) + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: {{ $}} + ; gfx940-PAD50-NEXT: bb.2: + ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD50-NEXT: S_NOP 5 + ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; + ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_2_preds + ; gfx940-PAD100: bb.0: + ; gfx940-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec + ; gfx940-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: bb.1: + ; gfx940-PAD100-NEXT: successors: %bb.2(0x80000000) + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: {{ $}} + ; gfx940-PAD100-NEXT: bb.2: + ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec + ; gfx940-PAD100-NEXT: S_NOP 7 + ; gfx940-PAD100-NEXT: S_NOP 5 + ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec bb.0: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec S_CBRANCH_VCCZ %bb.2, implicit undef $vcc diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 34e67d0..9999cb9 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -32,7 +32,6 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: .Ltmp1: diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll new file mode 100644 index 0000000..538ce15 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -0,0 +1,305 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s + +; CHECK: .amdgpu_pal_metadata +; CHECK-NEXT: --- +; CHECK-NEXT: amdpal.pipelines: +; CHECK-NEXT: - .api: Vulkan +; CHECK-NEXT: .compute_registers: +; CHECK-NEXT: .tg_size_en: true +; CHECK-NEXT: .tgid_x_en: false +; CHECK-NEXT: .tgid_y_en: false +; CHECK-NEXT: .tgid_z_en: false +; CHECK-NEXT: .tidig_comp_cnt: 0x1 +; CHECK-NEXT: .hardware_stages: +; CHECK-NEXT: .cs: +; CHECK-NEXT: .checksum_value: 0x9444d7d0 +; CHECK-NEXT: .debug_mode: 0 +; CHECK-NEXT: .excp_en: 0 +; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .ieee_mode: true +; CHECK-NEXT: .image_op: false +; CHECK-NEXT: .lds_size: 0x200 +; CHECK-NEXT: .mem_ordered: true +; CHECK-NEXT: .sgpr_limit: 0x6a +; CHECK-NEXT: .threadgroup_dimensions: +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: - 0x400 +; CHECK-NEXT: - 0x1 +; CHECK-NEXT: .trap_present: false +; CHECK-NEXT: .user_data_reg_map: +; CHECK-NEXT: - 0x10000000 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0 +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: - 0xffffffff +; CHECK-NEXT: .user_sgprs: 0x3 +; CHECK-NEXT: .vgpr_limit: 0x100 +; CHECK-NEXT: .wavefront_size: 0x40 +; CHECK-NEXT: .wgp_mode: true +; CHECK: .registers: {} +; CHECK-NEXT: .shader_functions: +; CHECK-NEXT: dynamic_stack: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: dynamic_stack_loop: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: multiple_stack: +; CHECK-NEXT: .backend_stack_size: 0x24 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: no_stack: +; CHECK-NEXT: .backend_stack_size: 0 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: no_stack_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x3 +; CHECK-NEXT: no_stack_extern_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: no_stack_extern_call_many_args: +; CHECK-NEXT: .backend_stack_size: 0x90 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: no_stack_indirect_call: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_lds: +; CHECK-NEXT: .backend_stack_size: 0 +; CHECK-NEXT: .lds_size: 0x100 +; CHECK-NEXT: .sgpr_count: 0x20 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0 +; CHECK-NEXT: .vgpr_count: 0x1 +; CHECK-NEXT: simple_lds_recurse: +; CHECK-NEXT: .backend_stack_size: 0x10 +; CHECK-NEXT: .lds_size: 0x100 +; CHECK-NEXT: .sgpr_count: 0x24 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10 +; CHECK-NEXT: .vgpr_count: 0x29 +; CHECK-NEXT: simple_stack: +; CHECK-NEXT: .backend_stack_size: 0x14 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x21 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14 +; CHECK-NEXT: .vgpr_count: 0x2 +; CHECK-NEXT: simple_stack_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x22 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x4 +; CHECK-NEXT: simple_stack_extern_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_stack_indirect_call: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x29 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x58 +; CHECK-NEXT: simple_stack_recurse: +; CHECK-NEXT: .backend_stack_size: 0x20 +; CHECK-NEXT: .lds_size: 0 +; CHECK-NEXT: .sgpr_count: 0x24 +; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20 +; CHECK-NEXT: .vgpr_count: 0x2a +; CHECK:amdpal.version: +; CHECK-NEXT: - 0x3 +; CHECK-NEXT: - 0 +; CHECK-NEXT:... +; CHECK-NEXT: .end_amdgpu_pal_metadata + +declare amdgpu_gfx float @extern_func(float) #0 +declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0 + +@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4 + +define amdgpu_gfx float @no_stack(float %arg0) #0 { + %add = fadd float %arg0, 1.0 + ret float %add +} + +define amdgpu_gfx float @simple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @multiple_stack(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + %stack2 = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack2 + %val2 = load volatile float, ptr addrspace(5) %stack2 + %add2 = fadd float %add, %val2 + ret float %add2 +} + +define amdgpu_gfx float @dynamic_stack(float %arg0) #0 { +bb0: + %cmp = fcmp ogt float %arg0, 0.0 + br i1 %cmp, label %bb1, label %bb2 + +bb1: + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + br label %bb2 + +bb2: + %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ] + ret float %res +} + +define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 { +bb0: + br label %bb1 + +bb1: + %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ] + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %add = fadd float %arg0, %val + %cmp = icmp sgt i32 %ctr, 0 + %newctr = sub i32 %ctr, 1 + br i1 %cmp, label %bb1, label %bb2 + +bb2: + ret float %add +} + +define amdgpu_gfx float @no_stack_call(float %arg0) #0 { + %res = call amdgpu_gfx float @simple_stack(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @simple_stack(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 { + %res = call amdgpu_gfx float @extern_func(float %arg0) + ret float %res +} + +define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @extern_func(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 { + %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0) + ret float %res +} + +define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 { + %fptr = load ptr, ptr addrspace(4) @funcptr + call amdgpu_gfx void %fptr() + ret float %arg0 +} + +define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %fptr = load ptr, ptr addrspace(4) @funcptr + call amdgpu_gfx void %fptr() + %add = fadd float %arg0, %val + ret float %add +} + +define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 { + %stack = alloca float, i32 4, align 4, addrspace(5) + store volatile float 2.0, ptr addrspace(5) %stack + %val = load volatile float, ptr addrspace(5) %stack + %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0) + %add = fadd float %res, %val + ret float %add +} + +@lds = internal addrspace(3) global [64 x float] undef + +define amdgpu_gfx float @simple_lds(float %arg0) #0 { + %val = load float, ptr addrspace(3) @lds + ret float %val +} + +define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 { + %val = load float, ptr addrspace(3) @lds + %res = call amdgpu_gfx float @simple_lds_recurse(float %val) + ret float %res +} + +attributes #0 = { nounwind } + +!amdgpu.pal.metadata.msgpack = !{!0} + +!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"} +!1 = !{i32 7} diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll index a70488a..a030f86 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll @@ -1,17 +1,20 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA,ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA,OBJ %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s ; GCN: preload_kernarg_header ; HSA: s_trap 2 ; NON-HSA: s_endpgm -; GCN-COUNT-63: s_nop 0 +; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0 +; OBJ-COUNT-63: s_nop 0 define amdgpu_kernel void @preload_kernarg_header(ptr %arg) { store ptr %arg, ptr %arg ret void } ; GCN: non_kernel_function +; GCN-NOT: s_trap 2 ; GCN-NOT: s_nop 0 ; GCN: flat_store define void @non_kernel_function(ptr %arg) { diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll index e7488e0..20edbd6 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll @@ -157,27 +157,27 @@ define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 { define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR2]] { +; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-16-NEXT: ret void ; ; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic -; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] { +; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { ; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]]) ; PRELOAD-20-NEXT: ret void ; @@ -235,23 +235,23 @@ define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0, define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) { ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; NO-PRELOAD-NEXT: ret void ; ; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-1-NEXT: ret void ; ; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-3-NEXT: ret void ; ; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-16-NEXT: ret void ; ; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes -; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { ; PRELOAD-20-NEXT: ret void ; ret void diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index d20c3a4..f0e709b 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -24,70 +24,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i8: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -98,70 +36,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -170,70 +46,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i8: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -242,70 +56,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -325,70 +77,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i8: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -399,70 +89,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -471,70 +99,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i8: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -543,70 +109,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) { ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -631,70 +135,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -705,70 +147,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 @@ -778,70 +158,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 @@ -851,70 +169,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 @@ -935,70 +191,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1009,70 +203,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 @@ -1082,70 +214,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 @@ -1155,70 +225,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 @@ -1244,70 +252,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1318,70 +264,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -1390,70 +274,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -1462,70 +284,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -1545,70 +305,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1619,70 +317,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -1691,70 +327,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -1763,70 +337,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -1850,70 +362,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -1923,70 +373,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4 @@ -1994,70 +382,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4 @@ -2065,70 +391,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4 @@ -2146,70 +410,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -2219,70 +421,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8 @@ -2290,70 +430,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8 @@ -2361,70 +439,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8 @@ -2449,70 +465,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s3, s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 @@ -2524,70 +478,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -2598,70 +490,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -2670,70 +500,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -2754,70 +522,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 @@ -2829,70 +535,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -2903,70 +547,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -2975,70 +557,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -3065,70 +585,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3141,70 +599,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff @@ -3217,70 +613,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff @@ -3291,70 +625,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16 ; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff @@ -3378,70 +650,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3454,70 +664,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff @@ -3530,70 +678,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16 ; GFX90a-PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff @@ -3604,70 +690,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 % ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16 ; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff @@ -3695,70 +719,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -3768,70 +730,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3841,70 +741,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3914,70 +752,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -3997,70 +773,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4070,70 +784,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4143,70 +795,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4216,70 +806,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8> ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -4308,70 +836,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: byref_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4385,70 +851,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: byref_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -4462,70 +866,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: byref_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -4539,70 +881,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: byref_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -4630,70 +910,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -4707,70 +925,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 @@ -4784,70 +940,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 @@ -4861,70 +955,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: byref_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 @@ -4964,70 +996,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v8i32_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 @@ -5046,70 +1016,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i32_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 @@ -5128,70 +1036,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v8i32_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 @@ -5210,70 +1056,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i32_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 @@ -5311,70 +1095,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v8i32_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0 @@ -5393,70 +1115,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i32_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0 @@ -5475,70 +1135,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v8i32_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0 @@ -5557,70 +1155,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32> ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i32_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0 @@ -5654,70 +1190,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -5729,70 +1203,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5 @@ -5802,70 +1214,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5 @@ -5875,70 +1225,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5 @@ -5959,70 +1247,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -6034,70 +1260,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9 @@ -6107,70 +1271,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9 @@ -6180,70 +1282,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9 @@ -6269,70 +1309,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6344,70 +1322,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7 @@ -6417,70 +1333,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7 @@ -6490,70 +1344,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7 @@ -6575,70 +1367,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6650,70 +1380,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11 @@ -6723,70 +1391,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11 @@ -6796,70 +1402,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11 @@ -6885,70 +1429,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -6960,70 +1442,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6 @@ -7033,70 +1453,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6 @@ -7106,70 +1464,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6 @@ -7191,70 +1487,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0 @@ -7266,70 +1500,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10 @@ -7339,70 +1511,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10 @@ -7412,70 +1522,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0 ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10 @@ -7500,70 +1548,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -7575,70 +1561,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7655,70 +1579,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7735,70 +1597,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8 ; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7826,70 +1626,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0 @@ -7901,70 +1639,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -7981,70 +1657,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -8061,70 +1675,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5 ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -8167,70 +1719,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v5f64_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8252,70 +1742,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5f64_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8337,70 +1765,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v5f64_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8422,70 +1788,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v5f64_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60 ; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40 @@ -8529,70 +1833,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v5f64_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8614,70 +1856,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v5f64_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8699,70 +1879,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v5f64_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8784,70 +1902,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v5f64_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60 ; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40 @@ -8882,70 +1938,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -8955,70 +1949,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8 ; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9042,70 +1974,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 8 ; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9129,70 +1999,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8 ; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9225,70 +2033,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9298,70 +2044,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9384,70 +2068,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9470,70 +2092,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8 ; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0 @@ -9570,70 +2130,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9643,70 +2141,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9714,70 +2150,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9785,70 +2159,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -9866,70 +2178,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -9939,70 +2189,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10010,70 +2198,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10081,70 +2207,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a) ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10166,70 +2230,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 -; GFX940-PRELOAD-1-NEXT: s_nop 0 +; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-1-NEXT: ; %bb.0: ; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8 ; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -10239,70 +2241,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-1-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 -; GFX940-PRELOAD-2-NEXT: s_nop 0 +; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-2-NEXT: ; %bb.0: ; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10310,70 +2250,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-2-NEXT: s_endpgm ; ; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 -; GFX940-PRELOAD-4-NEXT: s_nop 0 +; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-4-NEXT: ; %bb.0: ; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10381,70 +2259,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX940-PRELOAD-4-NEXT: s_endpgm ; ; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 -; GFX940-PRELOAD-8-NEXT: s_nop 0 +; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX940-PRELOAD-8-NEXT: ; %bb.0: ; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5] @@ -10462,70 +2278,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-NO-PRELOAD-NEXT: s_endpgm ; ; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 -; GFX90a-PRELOAD-1-NEXT: s_nop 0 +; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-1-NEXT: ; %bb.0: ; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0 @@ -10535,70 +2289,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-1-NEXT: s_endpgm ; ; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 -; GFX90a-PRELOAD-2-NEXT: s_nop 0 +; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-2-NEXT: ; %bb.0: ; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10606,70 +2298,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-2-NEXT: s_endpgm ; ; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 -; GFX90a-PRELOAD-4-NEXT: s_nop 0 +; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-4-NEXT: ; %bb.0: ; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] @@ -10677,70 +2307,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double ; GFX90a-PRELOAD-4-NEXT: s_endpgm ; ; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg: -; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments. -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 -; GFX90a-PRELOAD-8-NEXT: s_nop 0 +; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. +; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0 ; GFX90a-PRELOAD-8-NEXT: ; %bb.0: ; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0 ; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll new file mode 100644 index 0000000..ab03177 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll @@ -0,0 +1,69 @@ +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s +; REQUIRES: asserts + +; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4 +; CHECK-NEXT: => Final Score:1 +; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4 +; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1 +; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4 +; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1 +; CHECK-NEXT: => Final Score:4 +; CHECK-NEXT: Sorted Worklist: +; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +define amdgpu_kernel void @simple_users_scores() #0 { +entry: + ; should get a score of 1 + %simpleuser = alloca [4 x i64], align 4, addrspace(5) + ; should get a score of 4 + %manyusers = alloca [4 x i64], align 4, addrspace(5) + + store i32 42, ptr addrspace(5) %simpleuser + + %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2 + %v0 = load i8, ptr addrspace(5) %manyusers.1 + %v0.ext = zext i8 %v0 to i32 + store i32 %v0.ext, ptr addrspace(5) %manyusers.1 + + %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1 + %v1 = load i8, ptr addrspace(5) %manyusers.2 + %v1.ext = zext i8 %v0 to i32 + store i32 %v1.ext, ptr addrspace(5) %manyusers.2 + + ret void +} + +; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4 +; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4 +; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4 +; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1 +; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4 +; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1 +; CHECK-NEXT: => Final Score:30 +define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 { +entry: + ; should get a score of 1 + %stack = alloca [4 x i64], align 4, addrspace(5) + %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4 + %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + + store i32 42, ptr addrspace(5) %stack + br label %loop.outer + +loop.outer: + store i32 32, ptr addrspace(5) %stack + %outer.cmp = load i1, ptr addrspace(5) %stack.1 + br label %loop.inner + +loop.inner: + store i32 32, ptr addrspace(5) %stack.1 + %inner.cmp = load i1, ptr addrspace(5) %stack.2 + br i1 %inner.cmp, label %loop.inner, label %loop.outer + +exit: + store i32 64, ptr addrspace(5) %stack.2 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll index d92ba77..d070dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll @@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" } attributes #6 = { "amdgpu-flat-work-group-size"="512,512" } attributes #7 = { "amdgpu-flat-work-group-size"="64,256" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll index 2df219b..f62f1d5 100644 --- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll @@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" } attributes #18 = { "amdgpu-waves-per-eu"="9,10" } attributes #19 = { "amdgpu-waves-per-eu"="8,9" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir index 2ccc241..fdfc9b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir +++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir @@ -24,6 +24,7 @@ registers: - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' } frameInfo: maxAlignment: 1 + adjustsStack: true hasCalls: true machineFunctionInfo: maxKernArgAlign: 1 diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll index eaef63b..c1d647c 100644 --- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll @@ -19,5 +19,5 @@ define void @hoge() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 297a056..384a9c4 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() { !1 = !{i32 1, !"amdhsa_code_object_version", i32 400} ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index c0d1999..0903770 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -181,6 +181,8 @@ legalized: false regBankSelected: false selected: false tracksRegLiveness: true +frameInfo: + adjustsStack: true liveins: - { reg: '$vgpr0', virtual-reg: '%0' } - { reg: '$vgpr1', virtual-reg: '%1' } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index efbdbca..c6ccbd9 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -78,6 +78,7 @@ name: sgpr_spill_wrong_stack_id tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 764f494..f523b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,7 +16,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill @@ -150,7 +150,6 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v255, 1 @@ -270,7 +269,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -311,7 +310,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -444,7 +443,6 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v254, 1 @@ -563,7 +561,7 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -1530,7 +1528,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s24, s33 +; GCN-NEXT: s_mov_b32 s18, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1668,7 +1666,6 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_mov_b64 exec, 1 @@ -1801,7 +1798,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s24 +; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index f229f33..539cfc7 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -73,7 +73,7 @@ define amdgpu_kernel void @test_simple_indirect_call() { ;. ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" } ;. -; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 3558298..f8ec6bb 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -21,6 +21,7 @@ name: kernel tracksRegLiveness: true frameInfo: + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index b8bc01e..c6a5990 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -916,13 +916,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-O0-NEXT: s_getpc_b64 s[24:25] -; WAVE32-O0-NEXT: s_mov_b32 s24, s0 -; WAVE32-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-O0-NEXT: s_getpc_b64 s[20:21] +; WAVE32-O0-NEXT: s_mov_b32 s20, s0 +; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-O0-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0 ; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; WAVE32-O0-NEXT: s_mov_b32 s14, s8 ; WAVE32-O0-NEXT: s_mov_b32 s13, s7 @@ -934,17 +934,17 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[26:27] +; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-O0-NEXT: s_mov_b32 s6, s32 ; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1018,11 +1018,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1 -; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1 +; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0 @@ -1137,7 +1136,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload @@ -1155,13 +1153,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s0 -; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0 +; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s20, s0 +; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21 -; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9 -; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0 +; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21 +; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9 +; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7 @@ -1174,13 +1172,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5 ; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], 0 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[24:25] -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[26:27] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21] +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23] ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4 +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo ; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17 @@ -1254,7 +1252,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s1, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s0, v32, 0 @@ -1347,7 +1344,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-O0: ; %bb.0: ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-O0-NEXT: s_mov_b32 s26, s33 +; WAVE32-O0-NEXT: s_mov_b32 s25, s33 ; WAVE32-O0-NEXT: s_mov_b32 s33, s32 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1361,9 +1358,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0 ; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5 ; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1 -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1440,11 +1437,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1 +; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1 ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25 +; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0 @@ -1460,14 +1456,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-O0-NEXT: s_mov_b32 s33, s26 +; WAVE32-O0-NEXT: s_mov_b32 s33, s25 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE64-O0: ; %bb.0: ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE64-O0-NEXT: s_mov_b32 s28, s33 +; WAVE64-O0-NEXT: s_mov_b32 s19, s33 ; WAVE64-O0-NEXT: s_mov_b32 s33, s32 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1560,7 +1556,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18 ; WAVE64-O0-NEXT: ; implicit-def: $sgpr18 ; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18 -; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1 ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload @@ -1580,14 +1575,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] ; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 -; WAVE64-O0-NEXT: s_mov_b32 s33, s28 +; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] ; ; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects: ; WAVE32-WWM-PREALLOC: ; %bb.0: ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s25, s33 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill @@ -1677,7 +1672,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18 ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 @@ -1693,7 +1687,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 -; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s25 +; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [32 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll index 8d5dc79..049db01 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll @@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 { attributes #0 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll index 7a6f82d..c9387f1 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 { attributes #0 = { "uniform-work-group-size"="true" } ;. ; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll index c04154c..7183da2 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll @@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 { attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll index 2d5ff04..6ed04cf 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 { attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll index e8bf6fc..d5ba2fd 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll @@ -52,8 +52,8 @@ attributes #0 = { nounwind } attributes #1 = { "uniform-work-group-size"="false" } attributes #2 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll index 473eea4..7f0dfea 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll @@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 { attributes #0 = { nounwind readnone } attributes #1 = { "uniform-work-group-size"="true" } ;. -; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } -; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } +; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll index 221f1a1..8616c73 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll @@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 { attributes #0 = { "uniform-work-group-size"="false" } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ;. diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll index 717d3d9..0407994 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll @@ -540,6 +540,7 @@ define internal void @use512vgprs() { } define void @foo() #0 { + call void asm sideeffect "; use $0", "a"(i32 0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll index d2364a6..bfc249e 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -233,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -249,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 - ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -286,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 - ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6 + ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 @@ -356,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.1.Flow: ; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9 - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9 - ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9 + ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.2 ; SI-NEXT: {{ $}} @@ -371,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.3: ; SI-NEXT: successors: %bb.4(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 + ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2 ; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1 @@ -407,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: bb.7: ; SI-NEXT: successors: %bb.8(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 + ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6 ; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec ; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec ; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll index 37f207f..4939d526 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -47,7 +47,6 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id ; CHECK-NEXT: s_mov_b32 s15, 42 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] -; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index 3d9db68..6659e95 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -20,6 +20,7 @@ name: undef_identity_copy tracksRegLiveness: true frameInfo: maxAlignment: 4 + adjustsStack: true hasCalls: true machineFunctionInfo: isEntryFunction: true diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 82816b4..901e88a 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2479,8 +2479,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2 ; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -2494,8 +2493,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2529,10 +2527,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] @@ -2576,9 +2571,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 ; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000 +; GFX1032-NEXT: s_brev_b32 s1, 1 ; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 -; GFX1032-NEXT: s_add_i32 s1, s1, 32 ; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 ; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 @@ -2592,8 +2586,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1032-NEXT: s_min_u32 s0, s0, s1 +; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 ; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2609,15 +2602,15 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 -; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4 -; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 +; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 +; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 +; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 +; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 ; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 ; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 @@ -2625,10 +2618,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 -; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 -; GFX1064-NEXT: s_ff1_i32_b32 s1, s1 -; GFX1064-NEXT: s_add_i32 s1, s1, 32 -; GFX1064-NEXT: s_min_u32 s0, s0, s1 +; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 3a33194..7eabe98 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -101,7 +101,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] ; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 11f6a29..e79cb66 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -406,7 +406,6 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 -; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 @@ -633,7 +632,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index e5cebc1..def51f2 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -413,7 +413,6 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -657,7 +656,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 @@ -1285,7 +1283,6 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O0-NEXT: ; implicit-def: $sgpr15 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload @@ -1529,7 +1526,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6 ; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 |