diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
45 files changed, 2037 insertions, 2197 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll index 41fda6d..efa51ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -90,26 +90,24 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) { ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9 ; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11 -; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10 +; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v13, v[0:1], off offset:10 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa ; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 8, v2 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v1, 24, v5 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v3, v7, 8, v6 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4 -; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v5, 16, v8 :: v_dual_lshlrev_b32 v4, 24, v9 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2 -; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10 -; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12 +; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v10 ; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1 -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4 -; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7 +; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v7, 24, v12 :: v_dual_lshlrev_b32 v8, 16, v13 +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v3 +; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v6 ; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31] ; ; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -942,7 +940,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX1250-NOUNALIGNED: ; %bb.0: -; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa +; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2 @@ -954,27 +952,26 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4 ; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa -; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0 -; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8 +; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s13, s[0:1], 0x8 ; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s5, 8 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s2 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s6, 24 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s7, 16 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s8, 8 ; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0 -; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s9, 24 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s1, s0 +; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s12, 16 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s3, s11 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s5 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s6, s13 ; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3 -; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s3, s2 +; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s5, s4 ; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog ; ; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1: @@ -1351,11 +1348,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v3i32_align4: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align4: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align4: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v3i32_align4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v3i32_align4: ; GFX9: ; %bb.0: @@ -1388,11 +1399,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg } define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_i96_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_i96_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_i96_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_i96_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_i96_align8: ; GFX9: ; %bb.0: @@ -1425,11 +1450,25 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) { } define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v3i32_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v3i32_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v3i32_align8: ; GFX9: ; %bb.0: @@ -1462,11 +1501,25 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg } define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v6i16_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v6i16_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v6i16_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v6i16_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v6i16_align8: ; GFX9: ; %bb.0: @@ -1500,24 +1553,64 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg } define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) { -; GFX12-LABEL: s_load_constant_v12i8_align8: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s13, s0, 8 -; GFX12-NEXT: s_lshr_b32 s12, s0, 16 -; GFX12-NEXT: s_lshr_b32 s3, s0, 24 -; GFX12-NEXT: s_lshr_b32 s5, s1, 8 -; GFX12-NEXT: s_lshr_b32 s6, s1, 16 -; GFX12-NEXT: s_lshr_b32 s7, s1, 24 -; GFX12-NEXT: s_lshr_b32 s9, s2, 8 -; GFX12-NEXT: s_lshr_b32 s10, s2, 16 -; GFX12-NEXT: s_lshr_b32 s11, s2, 24 -; GFX12-NEXT: s_mov_b32 s4, s1 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s1, s13 -; GFX12-NEXT: s_mov_b32 s2, s12 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-UNALIGNED-LABEL: s_load_constant_v12i8_align8: +; GFX12-UNALIGNED: ; %bb.0: +; GFX12-UNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s13, s0, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s3, s0, 24 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s5, s1, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s6, s1, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s7, s1, 24 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s9, s2, 8 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-UNALIGNED-NEXT: s_lshr_b32 s11, s2, 24 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s4, s1 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s8, s2 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s1, s13 +; GFX12-UNALIGNED-NEXT: s_mov_b32 s2, s12 +; GFX12-UNALIGNED-NEXT: ; return to shader part epilog +; +; GFX12-NOUNALIGNED-LABEL: s_load_constant_v12i8_align8: +; GFX12-NOUNALIGNED: ; %bb.0: +; GFX12-NOUNALIGNED-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s13, s0, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s12, s0, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s3, s0, 24 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s5, s1, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s6, s1, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s7, s1, 24 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s9, s2, 8 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s10, s2, 16 +; GFX12-NOUNALIGNED-NEXT: s_lshr_b32 s11, s2, 24 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s4, s1 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s8, s2 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s1, s13 +; GFX12-NOUNALIGNED-NEXT: s_mov_b32 s2, s12 +; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_load_constant_v12i8_align8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshr_b32 s13, s0, 8 +; GFX1250-NEXT: s_lshr_b32 s12, s0, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s0, 24 +; GFX1250-NEXT: s_lshr_b32 s5, s1, 8 +; GFX1250-NEXT: s_lshr_b32 s6, s1, 16 +; GFX1250-NEXT: s_lshr_b32 s7, s1, 24 +; GFX1250-NEXT: s_lshr_b32 s9, s2, 8 +; GFX1250-NEXT: s_lshr_b32 s10, s2, 16 +; GFX1250-NEXT: s_lshr_b32 s11, s2, 24 +; GFX1250-NEXT: s_mov_b32 s4, s1 +; GFX1250-NEXT: s_mov_b32 s8, s2 +; GFX1250-NEXT: s_mov_b32 s1, s13 +; GFX1250-NEXT: s_mov_b32 s2, s12 +; GFX1250-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_load_constant_v12i8_align8: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll index aac499f..b486fabb 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -9,15 +9,14 @@ target triple = "amdgcn-amd-amdhsa" define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0 -; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0 +; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1 ; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo @@ -56,13 +55,11 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1] ; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm @@ -91,10 +88,9 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { ; GFX1250-LABEL: use_flat_to_private_addrspacecast: ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS @@ -110,9 +106,8 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 ; GFX1250-SDAG-NEXT: s_endpgm @@ -122,9 +117,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX1250-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll new file mode 100644 index 0000000..0d68762 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-flat-scratch-init-asan.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes='amdgpu-attributor' %s -o - | FileCheck %s + +@lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 + +;. +; CHECK: @lds_1 = internal addrspace(3) global [1 x i8] poison, align 4 +;. +define amdgpu_kernel void @k0() #0 { +; CHECK: Function Attrs: sanitize_address +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: store i8 7, ptr addrspace(3) @lds_1, align 4 +; CHECK-NEXT: ret void +; + store i8 7, ptr addrspace(3) @lds_1, align 4 + ret void +} + +attributes #0 = { sanitize_address } +; "amdgpu-no-flat-scratch-init" attribute should not be present in attribute list +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll index a688b6f..fb566e5 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll @@ -707,8 +707,8 @@ attributes #6 = { "enqueued-block" } ; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR15:[0-9]+]] = { nounwind "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } -; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind sanitize_address "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR19:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR20:[0-9]+]] = { "enqueued-block" "uniform-work-group-size"="false" } ; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "enqueued-block" "uniform-work-group-size"="false" } diff --git a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll index ef52694..54871a6 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics-system-scope.ll @@ -538,58 +538,61 @@ define double @flat_system_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB34_8 +; GFX1250-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB34_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB34_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_3: ; %Flow +; GFX1250-NEXT: .LBB34_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB34_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB34_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB34_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-NEXT: .LBB34_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB34_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB34_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB34_2 +; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB34_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val monotonic ret double %result @@ -600,58 +603,61 @@ define double @flat_one_as_atomic_fadd_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 ; GFX1250-NEXT: s_mov_b64 s[0:1], src_shared_base ; GFX1250-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 -; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmpx_ne_u32_e64 s1, v5 ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_6 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s1, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB35_8 +; GFX1250-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB35_3: ; %atomicrmw.check.private +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s1, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_3 -; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.global -; GFX1250-NEXT: global_atomic_add_f64 v[4:5], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execz .LBB35_5 +; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[4:5], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_3: ; %Flow +; GFX1250-NEXT: .LBB35_5: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s1, s1 -; GFX1250-NEXT: s_cbranch_execz .LBB35_5 -; GFX1250-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB35_7 +; GFX1250-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s2, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_add_f64_e32 v[0:1], v[4:5], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB35_5: ; %Flow1 +; GFX1250-NEXT: v_add_f64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-NEXT: .LBB35_7: ; %Flow1 ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB35_6: ; %Flow2 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB35_8 -; GFX1250-NEXT: ; %bb.7: ; %atomicrmw.shared -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-NEXT: s_cbranch_execz .LBB35_2 +; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.shared +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_add_rtn_f64 v[4:5], v0, v[2:3] -; GFX1250-NEXT: .LBB35_8: ; %atomicrmw.phi +; GFX1250-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo +; GFX1250-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -686,40 +692,42 @@ define double @flat_system_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB38_4 +; GFX1250-NEXT: .LBB38_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB38_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB38_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB38_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB38_2 +; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB38_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val monotonic ret double %result @@ -730,40 +738,42 @@ define double @flat_one_as_atomic_fmin_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_min_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB39_4 +; GFX1250-NEXT: .LBB39_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB39_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_min_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB39_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB39_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB39_2 +; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB39_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_min_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmin ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -798,40 +808,42 @@ define double @flat_system_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB42_4 +; GFX1250-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB42_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB42_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB42_2 +; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB42_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val monotonic ret double %result @@ -842,40 +854,42 @@ define double @flat_one_as_atomic_fmax_f64(ptr %ptr, double %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 -; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 +; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 +; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_2 -; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global -; GFX1250-NEXT: flat_atomic_max_num_f64 v[4:5], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_3 +; GFX1250-NEXT: ; %bb.1: ; %Flow +; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-NEXT: s_cbranch_execnz .LBB43_4 +; GFX1250-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-NEXT: flat_atomic_max_num_f64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-NEXT: .LBB43_2: ; %Flow ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 -; GFX1250-NEXT: s_cbranch_execz .LBB43_4 -; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo -; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-NEXT: s_cbranch_execz .LBB43_2 +; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v4, vcc_lo -; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off +; GFX1250-NEXT: v_dual_max_num_f64 v[2:3], v[2:3], v[2:3] :: v_dual_cndmask_b32 v6, -1, v0, vcc_lo +; GFX1250-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5] -; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: scratch_store_b64 v6, v[0:1], off -; GFX1250-NEXT: .LBB43_4: ; %atomicrmw.phi +; GFX1250-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1] +; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[2:3] +; GFX1250-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fmax ptr %ptr, double %val syncscope("one-as") monotonic ret double %result @@ -982,13 +996,11 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1000,10 +1012,9 @@ define i64 @flat_one_as_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB52_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1025,13 +1036,11 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1043,10 +1052,9 @@ define i64 @flat_system_atomic_min_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB53_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1068,13 +1076,11 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1086,10 +1092,9 @@ define i64 @flat_one_as_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB54_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1111,13 +1116,11 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1129,10 +1132,9 @@ define i64 @flat_system_atomic_max_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB55_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1154,13 +1156,11 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1172,10 +1172,9 @@ define i64 @flat_one_as_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB56_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1197,13 +1196,11 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1215,10 +1212,9 @@ define i64 @flat_system_atomic_umin_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB57_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1240,13 +1236,11 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1258,10 +1252,9 @@ define i64 @flat_one_as_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB58_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off @@ -1283,13 +1276,11 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v4 ; GFX1250-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-NEXT: ; %bb.1: ; %atomicrmw.global @@ -1301,10 +1292,9 @@ define i64 @flat_system_atomic_umax_i64(ptr %ptr, i64 %val) { ; GFX1250-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-NEXT: s_cbranch_execz .LBB59_4 ; GFX1250-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, s1, v0 +; GFX1250-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-NEXT: scratch_load_b64 v[4:5], v6, off diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 94ba5cd..6b5647e 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -569,10 +569,10 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: v_dual_mov_b32 v9, v1 :: v_dual_mov_b32 v8, v0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 +; GFX1250-NEXT: global_load_b128 v[0:3], v[8:9], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[8:9], off offset:16 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr @@ -752,12 +752,12 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 +; GFX1250-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_mov_b32 v16, v0 ; GFX1250-NEXT: s_clause 0x3 -; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 -; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 -; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 +; GFX1250-NEXT: global_load_b128 v[0:3], v[16:17], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[16:17], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[16:17], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[16:17], off offset:48 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr @@ -1055,16 +1055,16 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 +; GFX1250-NEXT: v_dual_mov_b32 v33, v1 :: v_dual_mov_b32 v32, v0 ; GFX1250-NEXT: s_clause 0x7 -; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off -; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 -; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 -; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 -; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 -; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 -; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 -; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 +; GFX1250-NEXT: global_load_b128 v[0:3], v[32:33], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[32:33], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[32:33], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[32:33], off offset:48 +; GFX1250-NEXT: global_load_b128 v[16:19], v[32:33], off offset:64 +; GFX1250-NEXT: global_load_b128 v[20:23], v[32:33], off offset:80 +; GFX1250-NEXT: global_load_b128 v[24:27], v[32:33], off offset:96 +; GFX1250-NEXT: global_load_b128 v[28:31], v[32:33], off offset:112 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <64 x bfloat>, ptr addrspace(1) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 243f0ed..f8655a7 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -256,7 +256,6 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_wait_storecnt 0x0 ; GCN-NEXT: .LBB5_3: ; %bb4 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_wait_xcnt 0x0 ; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ddd3b152..363a248 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2700,142 +2700,142 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) { ; ; GFX1250-LABEL: amd_kernel_v32i8: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b64_e32 v[8:9], 16 ; GFX1250-NEXT: v_mov_b64_e32 v[10:11], 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_lshr_b32 s16, s0, 16 -; GFX1250-NEXT: s_lshr_b32 s17, s0, 24 -; GFX1250-NEXT: s_lshr_b32 s20, s2, 16 -; GFX1250-NEXT: s_lshr_b32 s21, s2, 24 -; GFX1250-NEXT: s_lshr_b32 s14, s7, 16 -; GFX1250-NEXT: s_lshr_b32 s15, s7, 24 -; GFX1250-NEXT: s_bfe_u32 s27, s7, 0x80008 +; GFX1250-NEXT: s_lshr_b32 s16, s8, 16 +; GFX1250-NEXT: s_lshr_b32 s17, s8, 24 +; GFX1250-NEXT: s_lshr_b32 s6, s15, 16 +; GFX1250-NEXT: s_lshr_b32 s7, s15, 24 +; GFX1250-NEXT: s_bfe_u32 s27, s15, 0x80008 ; GFX1250-NEXT: s_add_co_i32 s17, s17, s17 ; GFX1250-NEXT: s_add_co_i32 s16, s16, s16 -; GFX1250-NEXT: s_lshr_b32 s18, s1, 16 -; GFX1250-NEXT: s_lshr_b32 s19, s1, 24 -; GFX1250-NEXT: s_lshr_b32 s22, s3, 16 -; GFX1250-NEXT: s_lshr_b32 s23, s3, 24 -; GFX1250-NEXT: s_bfe_u32 s29, s1, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x80008 -; GFX1250-NEXT: s_add_co_i32 s21, s21, s21 -; GFX1250-NEXT: s_add_co_i32 s20, s20, s20 ; GFX1250-NEXT: s_lshl_b32 s17, s17, 8 ; GFX1250-NEXT: s_and_b32 s16, s16, 0xff -; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 -; GFX1250-NEXT: s_add_co_i32 s27, s27, s27 ; GFX1250-NEXT: s_add_co_i32 s15, s15, s15 -; GFX1250-NEXT: s_add_co_i32 s14, s14, s14 -; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s27, s27, s27 +; GFX1250-NEXT: s_add_co_i32 s7, s7, s7 +; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 +; GFX1250-NEXT: s_or_b32 s16, s16, s17 +; GFX1250-NEXT: s_and_b32 s15, s15, 0xff +; GFX1250-NEXT: s_lshl_b32 s17, s27, 8 +; GFX1250-NEXT: s_lshl_b32 s7, s7, 8 +; GFX1250-NEXT: s_and_b32 s6, s6, 0xff +; GFX1250-NEXT: s_or_b32 s15, s15, s17 +; GFX1250-NEXT: s_or_b32 s6, s6, s7 +; GFX1250-NEXT: s_bfe_u32 s26, s14, 0x80008 +; GFX1250-NEXT: s_and_b32 s7, s15, 0xffff +; GFX1250-NEXT: s_lshl_b32 s6, s6, 16 +; GFX1250-NEXT: s_lshr_b32 s20, s10, 16 +; GFX1250-NEXT: s_lshr_b32 s21, s10, 24 +; GFX1250-NEXT: s_lshr_b32 s4, s14, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s14, 24 +; GFX1250-NEXT: s_or_b32 s6, s7, s6 +; GFX1250-NEXT: s_add_co_i32 s7, s14, s14 +; GFX1250-NEXT: s_add_co_i32 s26, s26, s26 +; GFX1250-NEXT: s_lshr_b32 s18, s9, 16 +; GFX1250-NEXT: s_lshr_b32 s19, s9, 24 +; GFX1250-NEXT: s_lshr_b32 s22, s11, 16 +; GFX1250-NEXT: s_lshr_b32 s23, s11, 24 +; GFX1250-NEXT: s_bfe_u32 s29, s9, 0x80008 +; GFX1250-NEXT: s_bfe_u32 s30, s11, 0x80008 +; GFX1250-NEXT: s_add_co_i32 s21, s21, s21 +; GFX1250-NEXT: s_add_co_i32 s20, s20, s20 +; GFX1250-NEXT: s_lshr_b32 s2, s13, 16 +; GFX1250-NEXT: s_lshr_b32 s3, s13, 24 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xff +; GFX1250-NEXT: s_lshl_b32 s14, s26, 8 +; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 +; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 ; GFX1250-NEXT: s_add_co_i32 s30, s30, s30 ; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 ; GFX1250-NEXT: s_add_co_i32 s22, s22, s22 ; GFX1250-NEXT: s_lshl_b32 s21, s21, 8 ; GFX1250-NEXT: s_and_b32 s20, s20, 0xff -; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 ; GFX1250-NEXT: s_add_co_i32 s29, s29, s29 ; GFX1250-NEXT: s_add_co_i32 s19, s19, s19 ; GFX1250-NEXT: s_add_co_i32 s18, s18, s18 -; GFX1250-NEXT: s_lshr_b32 s10, s5, 16 -; GFX1250-NEXT: s_lshr_b32 s11, s5, 24 -; GFX1250-NEXT: s_lshr_b32 s12, s6, 16 -; GFX1250-NEXT: s_lshr_b32 s13, s6, 24 -; GFX1250-NEXT: s_or_b32 s16, s16, s17 -; GFX1250-NEXT: s_and_b32 s7, s7, 0xff -; GFX1250-NEXT: s_lshl_b32 s17, s27, 8 -; GFX1250-NEXT: s_lshl_b32 s15, s15, 8 -; GFX1250-NEXT: s_and_b32 s14, s14, 0xff -; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_bfe_u32 s25, s13, 0x80008 +; GFX1250-NEXT: s_lshl_b32 s5, s5, 8 +; GFX1250-NEXT: s_and_b32 s4, s4, 0xff +; GFX1250-NEXT: s_or_b32 s7, s7, s14 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s3 +; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_and_b32 s11, s11, 0xff ; GFX1250-NEXT: s_lshl_b32 s30, s30, 8 ; GFX1250-NEXT: s_lshl_b32 s23, s23, 8 ; GFX1250-NEXT: s_and_b32 s22, s22, 0xff ; GFX1250-NEXT: s_or_b32 s20, s20, s21 -; GFX1250-NEXT: s_and_b32 s1, s1, 0xff +; GFX1250-NEXT: s_and_b32 s9, s9, 0xff ; GFX1250-NEXT: s_lshl_b32 s21, s29, 8 ; GFX1250-NEXT: s_lshl_b32 s19, s19, 8 ; GFX1250-NEXT: s_and_b32 s18, s18, 0xff -; GFX1250-NEXT: s_lshr_b32 s8, s4, 16 -; GFX1250-NEXT: s_lshr_b32 s9, s4, 24 -; GFX1250-NEXT: s_bfe_u32 s24, s4, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s25, s5, 0x80008 -; GFX1250-NEXT: s_bfe_u32 s26, s6, 0x80008 -; GFX1250-NEXT: s_or_b32 s7, s7, s17 -; GFX1250-NEXT: s_or_b32 s14, s14, s15 -; GFX1250-NEXT: s_add_co_i32 s13, s13, s13 -; GFX1250-NEXT: s_add_co_i32 s12, s12, s12 -; GFX1250-NEXT: s_add_co_i32 s11, s11, s11 -; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 -; GFX1250-NEXT: s_bfe_u32 s28, s0, 0x80008 -; GFX1250-NEXT: s_or_b32 s3, s3, s30 +; GFX1250-NEXT: s_lshr_b32 s0, s12, 16 +; GFX1250-NEXT: s_lshr_b32 s1, s12, 24 +; GFX1250-NEXT: s_bfe_u32 s24, s12, 0x80008 +; GFX1250-NEXT: s_or_b32 s4, s4, s5 +; GFX1250-NEXT: s_and_b32 s5, s7, 0xffff +; GFX1250-NEXT: s_add_co_i32 s7, s13, s13 +; GFX1250-NEXT: s_add_co_i32 s25, s25, s25 +; GFX1250-NEXT: s_lshl_b32 s3, s3, 8 +; GFX1250-NEXT: s_and_b32 s2, s2, 0xff +; GFX1250-NEXT: s_bfe_u32 s28, s8, 0x80008 +; GFX1250-NEXT: s_or_b32 s11, s11, s30 ; GFX1250-NEXT: s_or_b32 s22, s22, s23 -; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x80008 -; GFX1250-NEXT: s_or_b32 s1, s1, s21 +; GFX1250-NEXT: s_bfe_u32 s23, s10, 0x80008 +; GFX1250-NEXT: s_or_b32 s9, s9, s21 ; GFX1250-NEXT: s_or_b32 s18, s18, s19 -; GFX1250-NEXT: s_and_b32 s7, s7, 0xffff -; GFX1250-NEXT: s_lshl_b32 s14, s14, 16 -; GFX1250-NEXT: s_add_co_i32 s6, s6, s6 -; GFX1250-NEXT: s_add_co_i32 s26, s26, s26 -; GFX1250-NEXT: s_lshl_b32 s13, s13, 8 -; GFX1250-NEXT: s_and_b32 s12, s12, 0xff -; GFX1250-NEXT: s_add_co_i32 s5, s5, s5 -; GFX1250-NEXT: s_add_co_i32 s25, s25, s25 -; GFX1250-NEXT: s_lshl_b32 s11, s11, 8 -; GFX1250-NEXT: s_and_b32 s10, s10, 0xff -; GFX1250-NEXT: s_add_co_i32 s4, s4, s4 +; GFX1250-NEXT: s_lshl_b32 s4, s4, 16 +; GFX1250-NEXT: s_and_b32 s7, s7, 0xff +; GFX1250-NEXT: s_lshl_b32 s13, s25, 8 +; GFX1250-NEXT: s_or_b32 s2, s2, s3 +; GFX1250-NEXT: s_add_co_i32 s3, s12, s12 ; GFX1250-NEXT: s_add_co_i32 s24, s24, s24 -; GFX1250-NEXT: s_add_co_i32 s9, s9, s9 -; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 -; GFX1250-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1250-NEXT: s_add_co_i32 s1, s1, s1 +; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_and_b32 s11, s11, 0xffff ; GFX1250-NEXT: s_lshl_b32 s22, s22, 16 -; GFX1250-NEXT: s_add_co_i32 s2, s2, s2 +; GFX1250-NEXT: s_add_co_i32 s10, s10, s10 ; GFX1250-NEXT: s_add_co_i32 s23, s23, s23 -; GFX1250-NEXT: s_and_b32 s1, s1, 0xffff +; GFX1250-NEXT: s_and_b32 s9, s9, 0xffff ; GFX1250-NEXT: s_lshl_b32 s18, s18, 16 -; GFX1250-NEXT: s_add_co_i32 s0, s0, s0 +; GFX1250-NEXT: s_add_co_i32 s8, s8, s8 ; GFX1250-NEXT: s_add_co_i32 s28, s28, s28 -; GFX1250-NEXT: s_or_b32 s7, s7, s14 -; GFX1250-NEXT: s_and_b32 s6, s6, 0xff -; GFX1250-NEXT: s_lshl_b32 s14, s26, 8 -; GFX1250-NEXT: s_or_b32 s12, s12, s13 -; GFX1250-NEXT: s_and_b32 s5, s5, 0xff -; GFX1250-NEXT: s_lshl_b32 s13, s25, 8 -; GFX1250-NEXT: s_or_b32 s10, s10, s11 -; GFX1250-NEXT: s_and_b32 s4, s4, 0xff -; GFX1250-NEXT: s_lshl_b32 s11, s24, 8 -; GFX1250-NEXT: s_lshl_b32 s9, s9, 8 -; GFX1250-NEXT: s_and_b32 s8, s8, 0xff -; GFX1250-NEXT: s_or_b32 s3, s3, s22 -; GFX1250-NEXT: s_and_b32 s2, s2, 0xff -; GFX1250-NEXT: s_lshl_b32 s22, s23, 8 -; GFX1250-NEXT: s_or_b32 s1, s1, s18 +; GFX1250-NEXT: s_or_b32 s4, s5, s4 +; GFX1250-NEXT: s_or_b32 s5, s7, s13 +; GFX1250-NEXT: s_and_b32 s3, s3, 0xff +; GFX1250-NEXT: s_lshl_b32 s7, s24, 8 +; GFX1250-NEXT: s_lshl_b32 s1, s1, 8 ; GFX1250-NEXT: s_and_b32 s0, s0, 0xff +; GFX1250-NEXT: s_or_b32 s11, s11, s22 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xff +; GFX1250-NEXT: s_lshl_b32 s22, s23, 8 +; GFX1250-NEXT: s_or_b32 s9, s9, s18 +; GFX1250-NEXT: s_and_b32 s8, s8, 0xff ; GFX1250-NEXT: s_lshl_b32 s18, s28, 8 -; GFX1250-NEXT: s_or_b32 s6, s6, s14 -; GFX1250-NEXT: s_or_b32 s5, s5, s13 -; GFX1250-NEXT: s_or_b32 s4, s4, s11 -; GFX1250-NEXT: s_or_b32 s8, s8, s9 -; GFX1250-NEXT: s_or_b32 s2, s2, s22 -; GFX1250-NEXT: s_or_b32 s0, s0, s18 -; GFX1250-NEXT: s_and_b32 s6, s6, 0xffff -; GFX1250-NEXT: s_lshl_b32 s12, s12, 16 +; GFX1250-NEXT: s_or_b32 s3, s3, s7 +; GFX1250-NEXT: s_or_b32 s0, s0, s1 +; GFX1250-NEXT: s_or_b32 s10, s10, s22 +; GFX1250-NEXT: s_or_b32 s8, s8, s18 ; GFX1250-NEXT: s_and_b32 s5, s5, 0xffff -; GFX1250-NEXT: s_and_b32 s4, s4, 0xffff -; GFX1250-NEXT: s_lshl_b32 s8, s8, 16 -; GFX1250-NEXT: s_lshl_b32 s9, s10, 16 -; GFX1250-NEXT: s_and_b32 s2, s2, 0xffff +; GFX1250-NEXT: s_and_b32 s1, s3, 0xffff +; GFX1250-NEXT: s_lshl_b32 s0, s0, 16 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_and_b32 s10, s10, 0xffff ; GFX1250-NEXT: s_lshl_b32 s20, s20, 16 -; GFX1250-NEXT: s_and_b32 s0, s0, 0xffff +; GFX1250-NEXT: s_and_b32 s8, s8, 0xffff ; GFX1250-NEXT: s_lshl_b32 s16, s16, 16 -; GFX1250-NEXT: s_or_b32 s6, s6, s12 -; GFX1250-NEXT: s_or_b32 s4, s4, s8 -; GFX1250-NEXT: s_or_b32 s5, s5, s9 -; GFX1250-NEXT: s_or_b32 s2, s2, s20 -; GFX1250-NEXT: s_or_b32 s0, s0, s16 -; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 -; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 -; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3 +; GFX1250-NEXT: s_or_b32 s0, s1, s0 +; GFX1250-NEXT: s_or_b32 s1, s5, s2 +; GFX1250-NEXT: s_or_b32 s10, s10, s20 +; GFX1250-NEXT: s_or_b32 s8, s8, s16 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s6 +; GFX1250-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GFX1250-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX1250-NEXT: global_store_b128 v[10:11], v[4:7], off diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index 51652a0..2ae6fc2 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -117,12 +117,12 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: sadd64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -818,17 +818,17 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: suaddo64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5] -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[12:13], s[14:15] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[12:13] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 @@ -1096,12 +1096,12 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) { ; ; GFX1250-LABEL: ssub64rr: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_sub_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1798,17 +1798,17 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; ; GFX1250-LABEL: susubo64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_nc_u64 s[6:7], s[4:5], s[6:7] -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5] -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX1250-NEXT: s_sub_nc_u64 s[0:1], s[12:13], s[14:15] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[12:13] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX1250-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: global_store_b8 v2, v3, s[10:11] ; GFX1250-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 @@ -3099,70 +3099,70 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GFX1250-LABEL: sudiv64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1250-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_b64 s[6:7], s[6:7], 0xffffffff00000000 -; GFX1250-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1250-NEXT: s_and_b64 s[4:5], s[4:5], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: -; GFX1250-NEXT: s_cvt_f32_u32 s6, s4 -; GFX1250-NEXT: s_cvt_f32_u32 s7, s5 -; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[4:5] +; GFX1250-NEXT: s_cvt_f32_u32 s4, s6 +; GFX1250-NEXT: s_cvt_f32_u32 s5, s7 +; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_fmac_f32 s6, s7, 0x4f800000 -; GFX1250-NEXT: v_s_rcp_f32 s6, s6 +; GFX1250-NEXT: s_fmac_f32 s4, s5, 0x4f800000 +; GFX1250-NEXT: v_s_rcp_f32 s4, s4 ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_mul_f32 s6, s6, 0x5f7ffffc -; GFX1250-NEXT: s_mul_f32 s7, s6, 0x2f800000 +; GFX1250-NEXT: s_mul_f32 s4, s4, 0x5f7ffffc +; GFX1250-NEXT: s_mul_f32 s5, s4, 0x2f800000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_trunc_f32 s7, s7 -; GFX1250-NEXT: s_fmac_f32 s6, s7, 0xcf800000 -; GFX1250-NEXT: s_cvt_u32_f32 s9, s7 -; GFX1250-NEXT: s_mov_b32 s7, 0 +; GFX1250-NEXT: s_trunc_f32 s5, s5 +; GFX1250-NEXT: s_fmac_f32 s4, s5, 0xcf800000 +; GFX1250-NEXT: s_cvt_u32_f32 s9, s5 +; GFX1250-NEXT: s_mov_b32 s5, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_cvt_u32_f32 s8, s6 +; GFX1250-NEXT: s_cvt_u32_f32 s8, s4 ; GFX1250-NEXT: s_mul_u64 s[12:13], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s15, s8, s13 ; GFX1250-NEXT: s_mul_i32 s14, s8, s13 -; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s12 +; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s12 ; GFX1250-NEXT: s_mul_i32 s17, s9, s12 -; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], s[14:15] +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[4:5], s[14:15] ; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s12 ; GFX1250-NEXT: s_mul_hi_u32 s18, s9, s13 -; GFX1250-NEXT: s_add_co_u32 s6, s14, s17 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s15, s16 +; GFX1250-NEXT: s_add_co_u32 s4, s14, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s15, s16 ; GFX1250-NEXT: s_mul_i32 s12, s9, s13 ; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_cmp_lg_u32 s4, 0 ; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 ; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 ; GFX1250-NEXT: s_mul_i32 s12, s8, s11 -; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s10 +; GFX1250-NEXT: s_mul_hi_u32 s4, s8, s10 ; GFX1250-NEXT: s_mul_i32 s15, s9, s10 -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[4:5], s[12:13] ; GFX1250-NEXT: s_mul_hi_u32 s14, s9, s10 ; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s11 -; GFX1250-NEXT: s_add_co_u32 s6, s12, s15 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s13, s14 +; GFX1250-NEXT: s_add_co_u32 s4, s12, s15 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s13, s14 ; GFX1250-NEXT: s_mul_i32 s10, s9, s11 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s16, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[6:7], s[10:11] +; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[4:5], s[10:11] ; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 ; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1250-NEXT: s_mul_hi_u32 s6, s2, s8 +; GFX1250-NEXT: s_mul_hi_u32 s4, s2, s8 ; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 ; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 ; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 @@ -3170,33 +3170,33 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 ; GFX1250-NEXT: s_mul_i32 s8, s2, s10 ; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 -; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[8:9] +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[8:9] ; GFX1250-NEXT: s_mul_i32 s10, s3, s10 -; GFX1250-NEXT: s_add_co_u32 s6, s8, s11 -; GFX1250-NEXT: s_add_co_ci_u32 s6, s9, s12 +; GFX1250-NEXT: s_add_co_u32 s4, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s4, s9, s12 ; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[10:11] +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[4:5], s[10:11] ; GFX1250-NEXT: s_and_b64 s[10:11], s[8:9], 0xffffffff00000000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_or_b32 s10, s10, s8 -; GFX1250-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] +; GFX1250-NEXT: s_mul_u64 s[8:9], s[6:7], s[10:11] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_sub_co_u32 s6, s2, s8 +; GFX1250-NEXT: s_sub_co_u32 s4, s2, s8 ; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 ; GFX1250-NEXT: s_sub_co_i32 s12, s3, s9 ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 -; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s5 -; GFX1250-NEXT: s_sub_co_u32 s13, s6, s4 +; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s7 +; GFX1250-NEXT: s_sub_co_u32 s13, s4, s6 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s12, s5 +; GFX1250-NEXT: s_cmp_ge_u32 s12, s7 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1250-NEXT: s_cmp_ge_u32 s13, s6 ; GFX1250-NEXT: s_cselect_b32 s15, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s12, s5 +; GFX1250-NEXT: s_cmp_eq_u32 s12, s7 ; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[10:11], 1 ; GFX1250-NEXT: s_cselect_b32 s16, s15, s14 ; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[10:11], 2 @@ -3206,20 +3206,20 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 ; GFX1250-NEXT: s_sub_co_ci_u32 s3, s3, s9 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_ge_u32 s3, s5 +; GFX1250-NEXT: s_cmp_ge_u32 s3, s7 ; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1250-NEXT: s_cmp_ge_u32 s6, s4 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s3, s5 -; GFX1250-NEXT: s_cselect_b32 s3, s6, s8 +; GFX1250-NEXT: s_cmp_ge_u32 s4, s6 +; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s3, s7 +; GFX1250-NEXT: s_cselect_b32 s3, s4, s8 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1250-NEXT: s_cselect_b32 s9, s13, s11 ; GFX1250-NEXT: s_cselect_b32 s8, s12, s10 ; GFX1250-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1250-NEXT: .LBB16_2: -; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1250-NEXT: s_sub_co_i32 s5, 0, s4 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX1250-NEXT: s_sub_co_i32 s4, 0, s6 ; GFX1250-NEXT: s_mov_b32 s9, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -3228,23 +3228,23 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1250-NEXT: s_mul_i32 s5, s5, s3 +; GFX1250-NEXT: s_mul_i32 s4, s4, s3 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s5, s3, s5 -; GFX1250-NEXT: s_add_co_i32 s3, s3, s5 +; GFX1250-NEXT: s_mul_hi_u32 s4, s3, s4 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s4 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_hi_u32 s3, s2, s3 -; GFX1250-NEXT: s_mul_i32 s5, s3, s4 +; GFX1250-NEXT: s_mul_i32 s4, s3, s6 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_sub_co_i32 s2, s2, s5 -; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 -; GFX1250-NEXT: s_sub_co_i32 s6, s2, s4 -; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1250-NEXT: s_cselect_b32 s3, s5, s3 -; GFX1250-NEXT: s_cselect_b32 s2, s6, s2 -; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 -; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 -; GFX1250-NEXT: s_cselect_b32 s8, s5, s3 +; GFX1250-NEXT: s_sub_co_i32 s2, s2, s4 +; GFX1250-NEXT: s_add_co_i32 s4, s3, 1 +; GFX1250-NEXT: s_sub_co_i32 s5, s2, s6 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1250-NEXT: s_cselect_b32 s3, s4, s3 +; GFX1250-NEXT: s_cselect_b32 s2, s5, s2 +; GFX1250-NEXT: s_add_co_i32 s4, s3, 1 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1250-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1250-NEXT: .LBB16_3: ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir index 029aa39..c475efb 100644 --- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir +++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir @@ -128,13 +128,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0 %2.sub2_sub3:areg_128 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -153,13 +153,13 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY2]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY2]] ; CHECK-NEXT: SI_RETURN %0:vreg_64 = COPY $vgpr0_vgpr1 %1:vreg_64 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0 %2.sub2_sub3:areg_128_align2 = COPY %1 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -398,14 +398,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0 %1.sub1:areg_128 = COPY %0 %1.sub2:areg_128 = COPY %0 %1.sub3:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -425,14 +425,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]] ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0 %1.sub1:areg_128_align2 = COPY %0 %1.sub2:areg_128_align2 = COPY %0 %1.sub3:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -504,7 +504,7 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, [[COPY1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VS_64_with_sub1 */, [[COPY]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:VReg_64 */, [[COPY]] ; CHECK-NEXT: SI_RETURN %0:vgpr_32 = COPY $vgpr0 undef %1.sub0:areg_64 = COPY %0 @@ -512,7 +512,7 @@ body: | undef %2.sub0:vreg_64 = COPY %0 %2.sub1:vreg_64 = COPY %0 INLINEASM &"; use $0", 0 /* attdialect */, 4325385 /* reguse:AReg_64 */, killed %1 - INLINEASM &"; use $0", 0 /* attdialect */, 3670025 /* reguse:VReg_64 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:VReg_64 */, killed %2 SI_RETURN ... @@ -641,13 +641,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -668,13 +668,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1 %0.sub1:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0 %2.sub2_sub3:areg_128_align2 = COPY %0.sub1 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -890,14 +890,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128 = COPY %0.sub0 %1.sub1:areg_128 = COPY %0.sub0 %1.sub2:areg_128 = COPY %0.sub0 %1.sub3:areg_128 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %1 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %1 SI_RETURN ... @@ -917,14 +917,14 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0:vreg_64 = COPY $vgpr0 undef %1.sub0:areg_128_align2 = COPY %0.sub0 %1.sub1:areg_128_align2 = COPY %0.sub0 %1.sub2:areg_128_align2 = COPY %0.sub0 %1.sub3:areg_128_align2 = COPY %0.sub0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %1 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %1 SI_RETURN ... @@ -1051,13 +1051,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1076,13 +1076,13 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3 - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1 %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3 undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1 %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... @@ -1358,11 +1358,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8323081 /* reguse:AReg_128 */, killed %2 + INLINEASM &"; use $0", 0 /* attdialect */, 8847369 /* reguse:AReg_128 */, killed %2 SI_RETURN ... @@ -1379,11 +1379,11 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]] - ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY1]] + ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY1]] ; CHECK-NEXT: SI_RETURN %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %2:areg_128_align2 = COPY %0 - INLINEASM &"; use $0", 0 /* attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %2 + INLINEASM &"; use $0", 0 /* attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %2 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index be60a00..0cae0e5 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -705,12 +705,13 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[0:1] scale_offset -; GFX1250-NEXT: global_load_b32 v0, v0, s[2:3] scale_offset -; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX1250-NEXT: global_load_b32 v2, v0, s[2:3] scale_offset +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-NEXT: s_wait_loadcnt 0x1 -; GFX1250-NEXT: ds_store_b32 v2, v1 offset:32 +; GFX1250-NEXT: ds_store_b32 v0, v1 offset:32 ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: ds_store_b32 v3, v0 offset:32 +; GFX1250-NEXT: ds_store_b32 v3, v2 offset:32 ; GFX1250-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, ptr addrspace(1) %in0, i32 %x.i @@ -1282,14 +1283,14 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3) ; ; GFX1250-LABEL: simple_write2_v4f32_superreg_align4: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s8, s[4:5], 0x0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX1250-NEXT: s_load_b128 s[0:3], s[6:7], 0x0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s4 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 4, s8 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3 ; GFX1250-NEXT: v_dual_mov_b32 v3, s0 :: v_dual_mov_b32 v4, s1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 1e7855c..eefc781 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -541,11 +541,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -570,9 +569,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -586,14 +584,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -618,10 +615,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -727,13 +723,12 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -754,9 +749,8 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: s_clause 0x1 @@ -770,8 +764,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -780,7 +773,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -805,10 +798,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off @@ -917,11 +909,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -943,9 +934,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -953,15 +943,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 @@ -982,10 +971,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1069,11 +1057,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 @@ -1094,9 +1080,8 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off @@ -1104,8 +1089,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1115,7 +1099,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 @@ -1136,10 +1120,9 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -1400,11 +1383,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1429,9 +1411,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1445,14 +1426,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1477,10 +1457,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1590,13 +1569,12 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1617,9 +1595,8 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1633,8 +1610,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -1643,7 +1619,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -1668,10 +1644,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1784,11 +1759,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -1810,9 +1784,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1823,15 +1796,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 @@ -1852,10 +1824,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -1950,11 +1921,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 @@ -1975,9 +1944,8 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -1988,8 +1956,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -1999,7 +1966,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 @@ -2020,10 +1987,9 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2295,11 +2261,10 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2324,9 +2289,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2340,14 +2304,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2372,10 +2335,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2487,13 +2449,12 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2514,9 +2475,8 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2530,8 +2490,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -2540,7 +2499,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -2565,10 +2524,9 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2683,11 +2641,10 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -2709,9 +2666,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2722,15 +2678,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 @@ -2751,10 +2706,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -2851,11 +2805,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 @@ -2876,9 +2828,8 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -2889,8 +2840,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -2900,7 +2850,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 @@ -2921,10 +2871,9 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3198,11 +3147,10 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -3227,9 +3175,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3244,14 +3191,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3276,10 +3222,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3390,13 +3335,12 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3417,9 +3361,8 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3434,8 +3377,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -3444,7 +3386,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -3469,10 +3411,9 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3586,11 +3527,10 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -3612,9 +3552,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3626,15 +3565,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 @@ -3655,10 +3593,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -3754,11 +3691,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 @@ -3779,9 +3714,8 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -3793,8 +3727,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -3804,7 +3737,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 @@ -3825,10 +3758,9 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4101,11 +4033,10 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -4130,9 +4061,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4147,14 +4077,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4179,10 +4108,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4293,13 +4221,12 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4320,9 +4247,8 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4337,8 +4263,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -4347,7 +4272,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -4372,10 +4297,9 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4489,11 +4413,10 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -4515,9 +4438,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4529,15 +4451,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 @@ -4558,10 +4479,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -4657,11 +4577,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 @@ -4682,9 +4600,8 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -4696,8 +4613,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -4707,7 +4623,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 @@ -4728,10 +4644,9 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5004,11 +4919,10 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5033,9 +4947,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5050,14 +4963,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5082,10 +4994,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5196,13 +5107,12 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5223,9 +5133,8 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5240,8 +5149,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -5250,7 +5158,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5275,10 +5183,9 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5392,11 +5299,10 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -5418,9 +5324,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5432,15 +5337,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 @@ -5461,10 +5365,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5560,11 +5463,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 @@ -5585,9 +5486,8 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5599,8 +5499,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -5610,7 +5509,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 @@ -5631,10 +5530,9 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -5877,11 +5775,10 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -5902,9 +5799,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -5918,14 +5814,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -5946,10 +5841,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6061,13 +5955,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6084,9 +5977,8 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6100,8 +5992,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6110,7 +6001,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6131,10 +6022,9 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6249,11 +6139,10 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -6272,9 +6161,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6285,15 +6173,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 @@ -6311,10 +6198,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6409,11 +6295,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 @@ -6431,9 +6315,8 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6444,8 +6327,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -6455,7 +6337,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 @@ -6473,10 +6355,9 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6718,11 +6599,10 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -6743,9 +6623,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6759,14 +6638,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6787,10 +6665,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -6902,13 +6779,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -6925,9 +6801,8 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -6941,8 +6816,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -6951,7 +6825,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -6972,10 +6846,9 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7090,11 +6963,10 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7113,9 +6985,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7126,15 +6997,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 @@ -7152,10 +7022,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7250,11 +7119,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 @@ -7272,9 +7139,8 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7285,8 +7151,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -7296,7 +7161,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 @@ -7314,10 +7179,9 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7559,11 +7423,10 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -7584,9 +7447,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7600,14 +7462,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7628,10 +7489,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7743,13 +7603,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -7766,9 +7625,8 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7782,8 +7640,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -7792,7 +7649,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -7813,10 +7670,9 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -7931,11 +7787,10 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -7954,9 +7809,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -7967,15 +7821,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 @@ -7993,10 +7846,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8091,11 +7943,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 @@ -8113,9 +7963,8 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8126,8 +7975,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8137,7 +7985,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 @@ -8155,10 +8003,9 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8400,11 +8247,10 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -8425,9 +8271,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8441,14 +8286,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8469,10 +8313,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8584,13 +8427,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -8607,9 +8449,8 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8623,8 +8464,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -8633,7 +8473,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -8654,10 +8494,9 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8772,11 +8611,10 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -8795,9 +8633,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8808,15 +8645,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 @@ -8834,10 +8670,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -8932,11 +8767,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 @@ -8954,9 +8787,8 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -8967,8 +8799,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -8978,7 +8809,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 @@ -8996,10 +8827,9 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9281,12 +9111,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -9311,9 +9140,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9328,15 +9156,14 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9361,10 +9188,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9485,13 +9311,12 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -9512,9 +9337,8 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off @@ -9529,10 +9353,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -9540,7 +9363,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v3, v10 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v3 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -9565,10 +9388,9 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9690,13 +9512,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -9718,9 +9539,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9732,15 +9552,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 @@ -9761,10 +9581,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -9869,11 +9688,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 @@ -9894,9 +9711,8 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off @@ -9908,18 +9724,18 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, src_flat_scratch_base_hi :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v1, v3, v5 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 @@ -9940,10 +9756,9 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -10188,11 +10003,10 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -10214,10 +10028,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10233,14 +10046,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10262,18 +10074,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10386,13 +10197,12 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -10410,10 +10220,9 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10429,8 +10238,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -10439,7 +10247,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -10461,18 +10269,17 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -10588,11 +10395,10 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -10610,9 +10416,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10625,15 +10430,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 @@ -10650,17 +10454,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -10754,11 +10557,9 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 @@ -10775,9 +10576,8 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -10790,8 +10590,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10801,7 +10600,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 @@ -10818,17 +10617,16 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off ; GFX1250-GISEL-NEXT: s_endpgm @@ -11064,11 +10862,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -11090,10 +10887,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11112,14 +10908,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11141,11 +10936,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11268,13 +11062,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -11292,10 +11085,9 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off @@ -11314,8 +11106,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 @@ -11324,7 +11115,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, src_flat_scratch_base_hi, v7 bitop3:0x14 ; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo @@ -11346,11 +11137,10 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11476,11 +11266,10 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -11498,9 +11287,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11516,15 +11304,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 @@ -11541,10 +11328,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11648,11 +11434,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, src_flat_scratch_base_hi, v1 ; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 @@ -11669,9 +11453,8 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, src_flat_scratch_base_lo, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -11687,8 +11470,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -11698,7 +11480,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, src_flat_scratch_base_hi, v3 ; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 @@ -11715,10 +11497,9 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -11834,12 +11615,10 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global @@ -11855,9 +11634,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB110_4: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -11891,10 +11668,9 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 @@ -11910,9 +11686,7 @@ define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12060,12 +11834,10 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi ; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] ; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 ; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 @@ -12079,9 +11851,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 ; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 ; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12112,9 +11882,8 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 @@ -12130,9 +11899,7 @@ define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 ; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 @@ -12261,9 +12028,8 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12279,10 +12045,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB112_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12301,10 +12066,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 @@ -12320,10 +12084,9 @@ define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12413,9 +12176,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12434,11 +12196,10 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 ; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12454,9 +12215,8 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 @@ -12473,10 +12233,9 @@ define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12562,9 +12321,8 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12580,10 +12338,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: .LBB114_2: ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo -; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12602,10 +12359,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 @@ -12621,10 +12377,9 @@ define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 @@ -12714,9 +12469,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -12735,11 +12489,10 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 ; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 -; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 ; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -12755,9 +12508,8 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 ; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 -; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 ; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 ; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 @@ -12774,10 +12526,9 @@ define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 ; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo -; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 ; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index 2079543..b5b2655 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -2179,6 +2179,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 ; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 ; GFX1250-SDAG-NEXT: s_endpgm @@ -2190,15 +2191,16 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 ; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 4, v[2:3] +; GFX1250-GISEL-NEXT: ; kill: killed $vgpr4 killed $vgpr5 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: flat_load_b32 v6, v[4:5] scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 ; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 4827f75..5e6de6d 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_0_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max3_num_f32 v0, v0, v1, v2 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmax3_olt_1_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max3_num_f32 v0, v2, v0, v1 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v0, v1, v2 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_max3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_max3_num_f16 v0, v2, v0, v1 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 6dfefd8..6a6f232 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -162,32 +162,32 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_min3_num_f32 v0, v0, v1, v2 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -352,32 +352,32 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f32: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 -; GFX1250-NEXT: s_mov_b32 s22, s10 -; GFX1250-NEXT: s_mov_b32 s23, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 +; GFX1250-NEXT: s_mov_b32 s22, s2 +; GFX1250-NEXT: s_mov_b32 s23, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: s_mov_b32 s20, s6 -; GFX1250-NEXT: s_mov_b32 s21, s7 -; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: s_mov_b32 s20, s14 +; GFX1250-NEXT: s_mov_b32 s21, s15 +; GFX1250-NEXT: buffer_load_b32 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b32 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_min3_num_f32 v0, v2, v0, v1 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile float, ptr addrspace(1) %aptr, align 4 %b = load volatile float, ptr addrspace(1) %bptr, align 4 @@ -609,62 +609,62 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v0, v1, v2 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -897,62 +897,62 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-TRUE16: ; %bb.0: -; GFX1250-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-TRUE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-TRUE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-TRUE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-TRUE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-TRUE16-NEXT: buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-TRUE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-TRUE16-NEXT: v_min3_num_f16 v0.l, v0.l, v1.l, v2.l -; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-TRUE16-NEXT: s_endpgm ; ; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16: ; GFX1250-FAKE16: ; %bb.0: -; GFX1250-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1250-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-FAKE16-NEXT: s_mov_b32 s14, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s15, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s11 -; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s10 -; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX1250-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX1250-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX1250-FAKE16-NEXT: s_mov_b32 s23, s3 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s12, s2 -; GFX1250-FAKE16-NEXT: s_mov_b32 s13, s3 -; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s4 -; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s5 -; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s6 -; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s7 -; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS +; GFX1250-FAKE16-NEXT: s_mov_b32 s4, s10 +; GFX1250-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1250-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX1250-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX1250-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX1250-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX1250-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX1250-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS ; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s8, s0 -; GFX1250-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX1250-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX1250-FAKE16-NEXT: s_mov_b32 s1, s9 ; GFX1250-FAKE16-NEXT: v_min3_num_f16 v0, v2, v0, v1 -; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null +; GFX1250-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null ; GFX1250-FAKE16-NEXT: s_endpgm %a = load volatile half, ptr addrspace(1) %aptr, align 2 %b = load volatile half, ptr addrspace(1) %bptr, align 2 @@ -1217,36 +1217,36 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_0_f64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: s_mov_b32 s12, s6 -; GFX1250-NEXT: s_mov_b32 s13, s7 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s14 +; GFX1250-NEXT: s_mov_b32 s5, s15 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 @@ -1427,36 +1427,36 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs ; ; GFX1250-LABEL: test_fmin3_olt_1_f64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s18, s10 -; GFX1250-NEXT: s_mov_b32 s19, s11 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s7, s3 +; GFX1250-NEXT: s_mov_b32 s18, s2 +; GFX1250-NEXT: s_mov_b32 s19, s3 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s12, s2 -; GFX1250-NEXT: s_mov_b32 s13, s3 -; GFX1250-NEXT: s_mov_b32 s16, s4 -; GFX1250-NEXT: s_mov_b32 s17, s5 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s10 +; GFX1250-NEXT: s_mov_b32 s5, s11 +; GFX1250-NEXT: s_mov_b32 s16, s12 +; GFX1250-NEXT: s_mov_b32 s17, s13 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[16:19], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_xcnt 0x1 -; GFX1250-NEXT: s_mov_b32 s12, s6 -; GFX1250-NEXT: s_mov_b32 s13, s7 -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[12:15], null scope:SCOPE_SYS +; GFX1250-NEXT: s_mov_b32 s4, s14 +; GFX1250-NEXT: s_mov_b32 s5, s15 +; GFX1250-NEXT: s_mov_b32 s0, s8 +; GFX1250-NEXT: buffer_load_b64 v[4:5], off, s[4:7], null scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: s_mov_b32 s1, s9 ; GFX1250-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] ; GFX1250-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5] ; GFX1250-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_endpgm %a = load volatile double, ptr addrspace(1) %aptr, align 4 %b = load volatile double, ptr addrspace(1) %bptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index e532dea..f807169 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -11,22 +11,20 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: global_load_u8 v2, v[2:3], off -; GCN-SDAG-NEXT: global_load_u8 v3, v[4:5], off -; GCN-SDAG-NEXT: global_load_u8 v0, v[0:1], off +; GCN-SDAG-NEXT: global_load_u8 v6, v[2:3], off +; GCN-SDAG-NEXT: global_load_u8 v7, v[4:5], off +; GCN-SDAG-NEXT: global_load_u8 v10, v[0:1], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x2 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 -; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v2 +; GCN-SDAG-NEXT: v_lshlrev_b16 v0, 8, v6 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x1 -; GCN-SDAG-NEXT: v_lshlrev_b16 v2, 8, v3 +; GCN-SDAG-NEXT: v_lshlrev_b16 v1, 8, v7 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-SDAG-NEXT: v_or_b32_e32 v1, v7, v1 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-SDAG-NEXT: v_or_b32_e32 v1, v3, v2 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GCN-SDAG-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_bitop2_b32 v0, v10, v0 bitop3:0x54 +; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-SDAG-NEXT: global_store_b32 v[8:9], v0, off ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -35,13 +33,15 @@ define void @test_i8load_v4i8store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 -; GCN-GISEL-NEXT: global_load_u8 v0, v[0:1], off -; GCN-GISEL-NEXT: global_load_u8 v1, v[2:3], off -; GCN-GISEL-NEXT: global_load_u8 v2, v[4:5], off +; GCN-GISEL-NEXT: global_load_u8 v6, v[0:1], off +; GCN-GISEL-NEXT: global_load_u8 v7, v[2:3], off +; GCN-GISEL-NEXT: global_load_u8 v10, v[4:5], off ; GCN-GISEL-NEXT: s_wait_loadcnt 0x1 -; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; GCN-GISEL-NEXT: s_wait_xcnt 0x2 +; GCN-GISEL-NEXT: v_lshl_or_b32 v0, v7, 8, v6 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v2 :: v_dual_lshlrev_b32 v2, 24, v2 +; GCN-GISEL-NEXT: s_wait_xcnt 0x1 +; GCN-GISEL-NEXT: v_dual_lshlrev_b32 v1, 16, v10 :: v_dual_lshlrev_b32 v2, 24, v10 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_or3_b32 v0, v0, v1, v2 ; GCN-GISEL-NEXT: global_store_b32 v[8:9], v0, off @@ -64,21 +64,21 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 0 -; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_pk_add_u16 v10, v6, v2 -; GCN-SDAG-NEXT: v_pk_add_u16 v11, v7, v3 +; GCN-SDAG-NEXT: global_load_b128 v[8:11], v[2:3], off ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 12 +; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 +; GCN-SDAG-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-SDAG-NEXT: v_pk_add_u16 v12, v7, v11 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[6:7], 8 -; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v0 -; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v1 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 0 +; GCN-SDAG-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GCN-SDAG-NEXT: v_pk_add_u16 v4, v4, v8 ; GCN-SDAG-NEXT: s_clause 0x2 -; GCN-SDAG-NEXT: global_store_b16 v[2:3], v11, off -; GCN-SDAG-NEXT: global_store_b32 v[6:7], v10, off -; GCN-SDAG-NEXT: global_store_b64 v[8:9], v[4:5], off +; GCN-SDAG-NEXT: global_store_b16 v[2:3], v12, off +; GCN-SDAG-NEXT: global_store_b32 v[6:7], v1, off +; GCN-SDAG-NEXT: global_store_b64 v[10:11], v[4:5], off ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; ; GCN-GISEL-LABEL: test_v7i16_load_store: @@ -86,28 +86,29 @@ define i16 @test_v7i16_load_store(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2 ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: global_load_b128 v[4:7], v[0:1], off -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[2:3], off -; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 +; GCN-GISEL-NEXT: global_load_b128 v[8:11], v[2:3], off +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 2 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 4 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 6 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 8 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 10 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[22:23], 12 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_pk_add_u16 v2, v6, v2 -; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v0 -; GCN-GISEL-NEXT: v_pk_add_u16 v1, v5, v1 -; GCN-GISEL-NEXT: v_pk_add_u16 v3, v7, v3 +; GCN-GISEL-NEXT: v_pk_add_u16 v1, v6, v10 +; GCN-GISEL-NEXT: v_pk_add_u16 v4, v4, v8 +; GCN-GISEL-NEXT: v_pk_add_u16 v5, v5, v9 +; GCN-GISEL-NEXT: v_pk_add_u16 v6, v7, v11 ; GCN-GISEL-NEXT: s_clause 0x6 -; GCN-GISEL-NEXT: global_store_b16 v[8:9], v4, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[10:11], v4, off -; GCN-GISEL-NEXT: global_store_b16 v[12:13], v1, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[14:15], v1, off -; GCN-GISEL-NEXT: global_store_b16 v[16:17], v2, off -; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[18:19], v2, off -; GCN-GISEL-NEXT: global_store_b16 v[20:21], v3, off -; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-GISEL-NEXT: global_store_b16 v[2:3], v4, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[12:13], v4, off +; GCN-GISEL-NEXT: global_store_b16 v[14:15], v5, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[16:17], v5, off +; GCN-GISEL-NEXT: global_store_b16 v[18:19], v1, off +; GCN-GISEL-NEXT: global_store_d16_hi_b16 v[20:21], v1, off +; GCN-GISEL-NEXT: global_store_b16 v[22:23], v6, off +; GCN-GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] %vec1 = load <7 x i16>, ptr addrspace(1) %ptr1 %insert = insertelement <7 x i16> %vec1, i16 20, i32 4 @@ -253,8 +254,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_load_b128 v[22:25], v[0:1], off offset:32 ; GCN-SDAG-NEXT: global_load_b128 v[26:29], v[0:1], off offset:16 ; GCN-SDAG-NEXT: global_load_b128 v[30:33], v[0:1], off -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x70 +; GCN-SDAG-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x70 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[48:49], 48 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[38:39], 0x60 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[50:51], 32 @@ -262,14 +263,15 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_mov_b64_e32 v[66:67], 0 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[52:53], 0x50 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[54:55], 64 -; GCN-SDAG-NEXT: v_dual_mov_b32 v34, 0xc8 :: v_dual_mov_b32 v35, 0 +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_dual_mov_b32 v0, 0xc8 :: v_dual_mov_b32 v1, 0 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x7 -; GCN-SDAG-NEXT: global_store_b128 v[36:37], v[6:9], off +; GCN-SDAG-NEXT: global_store_b128 v[2:3], v[6:9], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x6 ; GCN-SDAG-NEXT: global_store_b128 v[38:39], v[10:13], off ; GCN-SDAG-NEXT: s_wait_loadcnt 0x5 ; GCN-SDAG-NEXT: s_wait_xcnt 0x1 -; GCN-SDAG-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v37, v17 +; GCN-SDAG-NEXT: v_dual_mov_b32 v2, v16 :: v_dual_mov_b32 v3, v17 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] @@ -286,8 +288,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[6:7], v[6:7], v[6:7] ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] -; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37] +; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[16:17], v[16:17], v[16:17] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[14:15], 0xc8, v[14:15] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[24:25], 0x64, v[24:25] @@ -298,8 +300,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[20:21], v[20:21], v[20:21] ; GCN-SDAG-NEXT: v_add_nc_u64_e32 v[18:19], v[18:19], v[18:19] ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[34:37], off -; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[0:3], off +; GCN-SDAG-NEXT: global_store_b128 v[52:53], v[0:3], off +; GCN-SDAG-NEXT: global_store_b128 v[54:55], v[34:37], off ; GCN-SDAG-NEXT: s_clause 0x7 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:96 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off offset:112 @@ -309,7 +311,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[30:33], off ; GCN-SDAG-NEXT: global_store_b128 v[4:5], v[26:29], off offset:16 -; GCN-SDAG-NEXT: s_wait_xcnt 0x8 +; GCN-SDAG-NEXT: s_wait_xcnt 0x9 ; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v32 :: v_dual_mov_b32 v1, v33 ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] ; @@ -325,7 +327,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_load_b128 v[22:25], v[0:1], off offset:48 ; GCN-GISEL-NEXT: global_load_b128 v[26:29], v[0:1], off offset:96 ; GCN-GISEL-NEXT: global_load_b128 v[30:33], v[0:1], off offset:112 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v[0:1], off offset:64 +; GCN-GISEL-NEXT: global_load_b128 v[34:37], v[0:1], off offset:64 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[38:39], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[48:49], 16 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[50:51], 32 @@ -333,7 +335,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_mov_b64_e32 v[66:67], 0x60 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[68:69], 0x70 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[54:55], 64 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[34:35], 0xc8 +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], 0xc8 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[64:65], 0x50 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x6 ; GCN-GISEL-NEXT: global_store_b128 v[38:39], v[10:13], off @@ -349,7 +352,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_store_b128 v[68:69], v[30:33], off ; GCN-GISEL-NEXT: s_wait_xcnt 0x5 ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[12:13], v[12:13], v[12:13] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[36:37], v[8:9] +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], v[8:9] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[10:11], v[10:11], v[10:11] ; GCN-GISEL-NEXT: s_wait_xcnt 0x4 ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[14:15], v[14:15], v[14:15] @@ -361,8 +364,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[22:23], v[22:23], v[22:23] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[24:25], v[24:25], v[24:25] ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 -; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[0:1], v[0:1] -; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[2:3], v[2:3] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[48:49], v[34:35], v[34:35] +; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[50:51], v[36:37], v[36:37] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[6:7], 0xc8, v[6:7] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[8:9] ; GCN-GISEL-NEXT: s_wait_xcnt 0x1 @@ -372,8 +375,8 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[30:31], v[30:31], v[30:31] ; GCN-GISEL-NEXT: v_add_nc_u64_e32 v[32:33], v[32:33], v[32:33] ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[0:3], off -; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[34:37], off +; GCN-GISEL-NEXT: global_store_b128 v[54:55], v[34:37], off +; GCN-GISEL-NEXT: global_store_b128 v[64:65], v[0:3], off ; GCN-GISEL-NEXT: s_clause 0x7 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:16 @@ -383,7 +386,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off offset:80 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[26:29], off offset:96 ; GCN-GISEL-NEXT: global_store_b128 v[4:5], v[30:33], off offset:112 -; GCN-GISEL-NEXT: s_wait_xcnt 0x9 +; GCN-GISEL-NEXT: s_wait_xcnt 0x8 ; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v12 :: v_dual_mov_b32 v1, v13 ; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] %a = load <16 x i64>, ptr addrspace(1) %ptr_a, align 4 @@ -402,16 +405,17 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GCN-SDAG-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset -; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset +; GCN-SDAG-NEXT: s_wait_xcnt 0x0 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7 ; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6 @@ -428,10 +432,9 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GCN-GISEL-NEXT: v_and_b32_e32 v8, 0x3ff, v0 ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 ; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 @@ -440,8 +443,10 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset -; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v8, s[0:1] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v8, s[2:3] scale_offset +; GCN-GISEL-NEXT: s_wait_xcnt 0x0 +; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4 ; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir index 92836d8..63db24a 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-reg-class-vgpr-mfma-to-av-with-load-source.mir @@ -486,7 +486,7 @@ body: | ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_ENDPGM 0 bb.0: S_NOP 0, implicit-def $agpr0 @@ -516,7 +516,7 @@ body: | S_NOP 0, implicit-def $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 S_NOP 0, implicit-def $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 S_NOP 0, implicit-def $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 - INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 + INLINEASM &"; use $0 ", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %0:vreg_512_align2 S_ENDPGM 0 ... @@ -1368,7 +1368,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1408,7 +1408,7 @@ body: | undef %2.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) early-clobber %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %2, 0, 0, 0, implicit $mode, implicit $exec early-clobber %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 @@ -1726,7 +1726,7 @@ body: | ; CHECK-NEXT: renamable $vgpr0_vgpr1 = GLOBAL_LOAD_DWORDX2 undef renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s64), addrspace 1) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 = V_MFMA_F32_32X32X8F16_vgprcd_e64 $vgpr16_vgpr17, $vgpr16_vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, renamable $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33 ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} @@ -1763,7 +1763,7 @@ body: | undef %0.sub0_sub1:vreg_512_align2 = GLOBAL_LOAD_DWORDX2 undef %3:vreg_64_align2, 0, 0, implicit $exec :: (load (s64), addrspace 1) %0:vreg_512_align2 = V_MFMA_F32_32X32X8F16_mac_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec %4:vreg_512_align2 = V_MFMA_F32_32X32X8F16_vgprcd_e64 %1, %1, %0, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 39190537 /* reguse:VReg_512_Align2 */, %4 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40239113 /* reguse:VReg_512_Align2 */, %4 S_CBRANCH_VCCNZ %bb.1, implicit $vcc S_BRANCH %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll index 9cbdc38..5b3e486 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll @@ -8,16 +8,16 @@ define amdgpu_kernel void @s_input_output_i128() { ; GFX908-LABEL: name: s_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: s_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9633802 /* regdef:SGPR_128 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10682378 /* regdef:SGPR_128 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9633801 /* reguse:SGPR_128 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10682377 /* reguse:SGPR_128 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=s"() call void asm sideeffect "; use $0", "s"(i128 %val) @@ -27,16 +27,16 @@ define amdgpu_kernel void @s_input_output_i128() { define amdgpu_kernel void @v_input_output_i128() { ; GFX908-LABEL: name: v_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7798793 /* reguse:VReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: v_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7995401 /* reguse:VReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8257545 /* reguse:VReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = tail call i128 asm sideeffect "; def $0", "=v"() call void asm sideeffect "; use $0", "v"(i128 %val) @@ -47,16 +47,16 @@ define amdgpu_kernel void @a_input_output_i128() { ; GFX908-LABEL: name: a_input_output_i128 ; GFX908: bb.0 (%ir-block.0): - ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8323082 /* regdef:AReg_128 */, def %13 + ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8847370 /* regdef:AReg_128 */, def %13 ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13 - ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8323081 /* reguse:AReg_128 */, [[COPY]] + ; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8847369 /* reguse:AReg_128 */, [[COPY]] ; GFX908-NEXT: S_ENDPGM 0 ; ; GFX90A-LABEL: name: a_input_output_i128 ; GFX90A: bb.0 (%ir-block.0): - ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8650762 /* regdef:AReg_128_Align2 */, def %11 + ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 9568266 /* regdef:AReg_128_Align2 */, def %11 ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11 - ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY]] + ; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY]] ; GFX90A-NEXT: S_ENDPGM 0 %val = call i128 asm sideeffect "; def $0", "=a"() call void asm sideeffect "; use $0", "a"(i128 %val) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll index 90fcb51..fa97380 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll @@ -11,14 +11,11 @@ declare i32 @llvm.amdgcn.cluster.id.z() #0 define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-MESA3D-LABEL: test_cluster_id_x: ; CHECK-MESA3D: .amd_kernel_code_t @@ -68,7 +65,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -98,14 +95,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_x: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -155,7 +149,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -190,14 +184,11 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) { define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-MESA3D-LABEL: test_cluster_id_y: ; CHECK-MESA3D: .amd_kernel_code_t @@ -247,7 +238,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -277,14 +268,11 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_y: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -334,7 +322,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -369,16 +357,14 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 { define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-UNKNOWN: ; %bb.0: -; CHECK-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; CHECK-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; CHECK-UNKNOWN-NEXT: s_wait_xcnt 0x0 +; CHECK-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 ; CHECK-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; CHECK-UNKNOWN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; CHECK-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[0:1] +; CHECK-UNKNOWN-NEXT: global_store_b32 v0, v1, s[2:3] ; CHECK-UNKNOWN-NEXT: s_endpgm -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ; CHECK-MESA3D-LABEL: test_cluster_id_z: ; CHECK-MESA3D: .amd_kernel_code_t @@ -428,7 +414,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -460,16 +446,14 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z: ; CHECK-G-UNKNOWN: ; %bb.0: -; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s2, ttmp7, 16 +; CHECK-G-UNKNOWN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 +; CHECK-G-UNKNOWN-NEXT: s_wait_xcnt 0x0 +; CHECK-G-UNKNOWN-NEXT: s_lshr_b32 s0, ttmp7, 16 ; CHECK-G-UNKNOWN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; CHECK-G-UNKNOWN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0 ; CHECK-G-UNKNOWN-NEXT: s_wait_kmcnt 0x0 -; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[0:1] +; CHECK-G-UNKNOWN-NEXT: global_store_b32 v1, v0, s[2:3] ; CHECK-G-UNKNOWN-NEXT: s_endpgm -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 -; CHECK-G-UNKNOWN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ; CHECK-G-MESA3D-LABEL: test_cluster_id_z: ; CHECK-G-MESA3D: .amd_kernel_code_t @@ -519,7 +503,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll index aa3b7b3..3ef84a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -956,7 +956,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1044,7 +1044,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1135,7 +1135,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1219,7 +1219,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll index afe37e3..b8ff9e5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll @@ -65,7 +65,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -153,7 +153,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll index 7ea4fa5..9bca696 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -155,7 +155,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -246,7 +246,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -330,7 +330,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -421,7 +421,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -509,7 +509,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -600,7 +600,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -684,7 +684,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -775,7 +775,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -863,7 +863,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 { ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -954,7 +954,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-MESA3D-NEXT: gds_segment_byte_size = 0 @@ -1038,7 +1038,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o ; CHECK-G-MESA3D-NEXT: is_ptr64 = 1 ; CHECK-G-MESA3D-NEXT: is_dynamic_callstack = 0 ; CHECK-G-MESA3D-NEXT: is_debug_enabled = 0 -; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 0 +; CHECK-G-MESA3D-NEXT: is_xnack_enabled = 1 ; CHECK-G-MESA3D-NEXT: workitem_private_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: workgroup_group_segment_byte_size = 0 ; CHECK-G-MESA3D-NEXT: gds_segment_byte_size = 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll index 56215ca..67d0410 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll @@ -59,21 +59,20 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX9-NEXT: global_store_dword v[0:1], v0, off ; GFX9-NEXT: s_endpgm ; -; GFX1250-SDAG-LABEL: is_private_vgpr: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v1 -; GFX1250-SDAG-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 -; GFX1250-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v0, off -; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-LABEL: is_private_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: global_store_b32 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm ; ; CI-GISEL-LABEL: is_private_vgpr: ; CI-GISEL: ; %bb.0: @@ -122,22 +121,6 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) { ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: is_private_vgpr: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v1, v2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x4000000, v0 -; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v0, off -; GFX1250-GISEL-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds ptr, ptr addrspace(1) %ptr.ptr, i32 %id %ptr = load volatile ptr, ptr addrspace(1) %gep @@ -206,9 +189,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX1250-SDAG-LABEL: is_private_sgpr: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, s1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s0, 0x4000000 ; GFX1250-SDAG-NEXT: s_cselect_b32 s0, -1, 0 @@ -285,9 +267,8 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, s0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s1, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s0, 0x4000000 ; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB1_2 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %bb0 @@ -311,5 +292,4 @@ bb1: ; CI: {{.*}} ; GFX10-GISEL: {{.*}} ; GFX11-GISEL: {{.*}} -; GFX1250: {{.*}} ; SI-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll index 4f7bbf8..42a50bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll @@ -5,13 +5,13 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_bcast_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) @@ -92,13 +92,13 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_down_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) @@ -179,13 +179,13 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_up_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) @@ -266,13 +266,13 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { ; GFX1250-LABEL: v_permlane_xor_b32_vss: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s6 ; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir index 76e2092..abcae69 100644 --- a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir @@ -69,9 +69,9 @@ body: | bb.0: ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GCN-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll index 1e6b77e..4ad161c 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -471,13 +471,13 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r ; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB4_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader -; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB4_2: ; %for.body @@ -602,13 +602,13 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d, ; GFX1250-NEXT: s_cmp_eq_u32 s0, 0 ; GFX1250-NEXT: s_cbranch_scc1 .LBB5_3 ; GFX1250-NEXT: ; %bb.1: ; %for.body.preheader -; GFX1250-NEXT: s_load_b128 s[4:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[6:7], v[0:1] -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[4:5], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], s[10:11], v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], s[8:9], v[0:1] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0xb0, v[2:3] ; GFX1250-NEXT: .LBB5_2: ; %for.body diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index dbcd370..08ec0c8 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -1117,18 +1117,19 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; ; GFX1250-LABEL: mad_i64_i32_uniform: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s7, 0 +; GFX1250-NEXT: s_mov_b32 s5, 0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s4, s2 ; GFX1250-NEXT: s_mov_b32 s2, s3 -; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_mov_b32 s3, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] -; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_mul_u64 s[2:3], s[4:5], s[2:3] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[6:7] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 61f2629..c19d5a6 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2031626 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index fef9a9a..ae08054 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -257,16 +257,15 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_imax_sge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_i8 s2, s[2:3], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_i8 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_i8 s4, s[2:3], 0x0 +; GFX1250-NEXT: s_load_i8 s5, s[6:7], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_max_i32 s2, s2, s3 +; GFX1250-NEXT: s_max_i32 s2, s4, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -701,16 +700,15 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_umax_uge_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_u8 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_u8 s4, s[2:3], 0x0 +; GFX1250-NEXT: s_load_u8 s5, s[6:7], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_max_u32 s2, s2, s3 +; GFX1250-NEXT: s_max_u32 s2, s4, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b32_e32 v1, s2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -777,13 +775,12 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset -; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b32 s6, s[0:1], 0x0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u32_e32 v0, s2, v0 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1250-NEXT: v_max_u32_e32 v0, s6, v0 +; GFX1250-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_test_umax_ugt_i32: @@ -1122,12 +1119,12 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_ugt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1175,12 +1172,12 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umax_uge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1228,12 +1225,12 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sgt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; @@ -1281,12 +1278,12 @@ define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imax_sge_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_max_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 311527d..6a3d31f 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -131,14 +131,14 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_sle_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -1172,14 +1172,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16 ; ; GFX1250-LABEL: s_test_imin_sle_v4i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_pk_min_i16 v1, s1, s3 ; GFX1250-NEXT: v_pk_min_i16 v0, s0, s2 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b @@ -1307,14 +1307,14 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_slt_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -1484,14 +1484,14 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_imin_slt_i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_u16 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_i16 v1, v1, v2 ; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset @@ -1686,16 +1686,16 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32 ; ; GFX1250-LABEL: s_test_imin_slt_v2i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x8 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_min_i32 s0, s0, s2 ; GFX1250-NEXT: s_min_i32 s1, s1, s3 ; GFX1250-NEXT: v_mov_b32_e32 v0, s0 ; GFX1250-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-NEXT: s_endpgm %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b @@ -2011,14 +2011,14 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ule_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -2171,16 +2171,16 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 4, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b96 v[0:2], v3, s[2:3] -; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[4:5] +; GFX1250-NEXT: global_load_b96 v[4:6], v3, s[6:7] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v2, v2, v6 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v5 @@ -2374,14 +2374,14 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_umin_ule_v3i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b64 v[0:1], v4, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b64 v[2:3], v4, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_xcnt 0x0 ; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -2611,14 +2611,14 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp ; ; GFX1250-LABEL: v_test_umin_ult_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -2771,14 +2771,14 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa ; ; GFX1250-LABEL: v_test_umin_ult_i8: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3] -; GFX1250-NEXT: global_load_u8 v2, v0, s[4:5] +; GFX1250-NEXT: global_load_u8 v2, v0, s[6:7] ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_min_u16 v1, v1, v2 ; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1] @@ -3023,23 +3023,22 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i32_multi_use: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b32 s5, s[6:7], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[12:13], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[14:15], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_lt_u32 s4, s5 -; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_cmp_lt_u32 s0, s1 +; GFX1250-NEXT: s_cselect_b32 s2, -1, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6 -; GFX1250-NEXT: s_and_b32 s6, s6, exec_lo -; GFX1250-NEXT: s_cselect_b32 s4, s4, s5 -; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX1250-NEXT: s_and_b32 s2, s2, exec_lo +; GFX1250-NEXT: s_cselect_b32 s0, s0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b32 v1, v2, s[0:1] -; GFX1250-NEXT: global_store_b8 v1, v0, s[2:3] +; GFX1250-NEXT: global_store_b32 v1, v2, s[8:9] +; GFX1250-NEXT: global_store_b8 v1, v0, s[10:11] ; GFX1250-NEXT: s_endpgm %a = load i32, ptr addrspace(1) %aptr, align 4 %b = load i32, ptr addrspace(1) %bptr, align 4 @@ -3220,12 +3219,12 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; ; GFX1250-LABEL: v_test_umin_ult_i16_multi_use: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x0 ; GFX1250-NEXT: v_mov_b32_e32 v0, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_load_u16 v1, v0, s[6:7] -; GFX1250-NEXT: global_load_u16 v2, v0, s[4:5] +; GFX1250-NEXT: global_load_u16 v1, v0, s[14:15] +; GFX1250-NEXT: global_load_u16 v2, v0, s[12:13] ; GFX1250-NEXT: s_wait_loadcnt 0x1 ; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 @@ -3235,8 +3234,8 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX1250-NEXT: global_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: global_store_b16 v0, v1, s[8:9] +; GFX1250-NEXT: global_store_b8 v0, v2, s[10:11] ; GFX1250-NEXT: s_endpgm %a = load i16, ptr addrspace(1) %aptr, align 2 %b = load i16, ptr addrspace(1) %bptr, align 2 @@ -4338,12 +4337,12 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ult_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp ult i64 %a, %b @@ -4462,12 +4461,12 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_umin_ule_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_u64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp ule i64 %a, %b @@ -4586,12 +4585,12 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_slt_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp slt i64 %a, %b @@ -4710,12 +4709,12 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 ; ; GFX1250-LABEL: test_imin_sle_i64: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[4:5] +; GFX1250-NEXT: v_min_i64 v[0:1], s[2:3], s[6:7] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm %tmp = icmp sle i64 %a, %b @@ -4872,14 +4871,14 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_sle_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset @@ -5042,14 +5041,14 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr ; ; GFX1250-LABEL: v_test_imin_ule_v2i16: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scale_offset -; GFX1250-NEXT: global_load_b32 v2, v0, s[4:5] scale_offset +; GFX1250-NEXT: global_load_b32 v2, v0, s[6:7] scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_pk_min_u16 v1, v1, v2 ; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index baccb4c..d29847e 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -450,6 +450,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a, ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x34 +; GFX1250-NEXT: ; kill: killed $sgpr4_sgpr5 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mul_i32 s2, s3, s2 ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 @@ -613,25 +614,25 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add ; ; GFX1250-LABEL: v_trunc_i64_mul_to_i32: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s6, s10 -; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s6 +; GFX1250-NEXT: s_mov_b32 s15, s7 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s12, s2 ; GFX1250-NEXT: s_mov_b32 s13, s3 ; GFX1250-NEXT: buffer_load_b32 v0, off, s[12:15], null -; GFX1250-NEXT: buffer_load_b32 v1, off, s[4:7], null -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mul_lo_u32 v0, v1, v0 -; GFX1250-NEXT: buffer_store_b32 v0, off, s[8:11], null +; GFX1250-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_trunc_i64_mul_to_i32: @@ -2091,11 +2092,11 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun ; ; GFX1250-LABEL: s_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[4:5] +; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7] ; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX1250-NEXT: s_mov_b32 s2, -1 @@ -2292,25 +2293,25 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; ; GFX1250-LABEL: v_mul_i64: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1250-NEXT: s_mov_b32 s10, -1 -; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s14, s10 -; GFX1250-NEXT: s_mov_b32 s15, s11 -; GFX1250-NEXT: s_mov_b32 s6, s10 -; GFX1250-NEXT: s_mov_b32 s7, s11 +; GFX1250-NEXT: s_load_b64 s[8:9], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s6, -1 +; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s14, s6 +; GFX1250-NEXT: s_mov_b32 s15, s7 +; GFX1250-NEXT: s_mov_b32 s10, s6 +; GFX1250-NEXT: s_mov_b32 s11, s7 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_mov_b32 s12, s2 ; GFX1250-NEXT: s_mov_b32 s13, s3 ; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[12:15], null -; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[4:7], null -; GFX1250-NEXT: s_mov_b32 s8, s0 -; GFX1250-NEXT: s_mov_b32 s9, s1 +; GFX1250-NEXT: buffer_load_b64 v[2:3], off, s[8:11], null +; GFX1250-NEXT: s_mov_b32 s4, s0 +; GFX1250-NEXT: s_mov_b32 s5, s1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3] -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: v_mul_i64: @@ -2845,30 +2846,30 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace( ; ; GFX1250-LABEL: mul64_in_branch: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1250-NEXT: s_cmp_lg_u64 s[12:13], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_3 ; GFX1250-NEXT: ; %bb.1: ; %else -; GFX1250-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7] +; GFX1250-NEXT: s_mul_u64 s[0:1], s[12:13], s[14:15] ; GFX1250-NEXT: s_cbranch_execnz .LBB16_4 ; GFX1250-NEXT: .LBB16_2: ; %if -; GFX1250-NEXT: s_mov_b32 s7, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s6, -1 -; GFX1250-NEXT: s_mov_b32 s4, s2 -; GFX1250-NEXT: s_mov_b32 s5, s3 -; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null +; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s0, s10 +; GFX1250-NEXT: s_mov_b32 s1, s11 +; GFX1250-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null ; GFX1250-NEXT: s_branch .LBB16_5 ; GFX1250-NEXT: .LBB16_3: -; GFX1250-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX1250-NEXT: ; implicit-def: $sgpr0_sgpr1 ; GFX1250-NEXT: s_branch .LBB16_2 ; GFX1250-NEXT: .LBB16_4: -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-NEXT: .LBB16_5: ; %endif -; GFX1250-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1250-NEXT: s_mov_b32 s2, -1 +; GFX1250-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1250-NEXT: s_mov_b32 s10, -1 ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null +; GFX1250-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null ; GFX1250-NEXT: s_endpgm ; ; EG-LABEL: mul64_in_branch: diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index b0651ef..78207c2 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -340,46 +340,46 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fadd_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], v[34:35] @@ -395,58 +395,58 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[42:43] @@ -466,14 +466,14 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -1597,46 +1597,46 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fmul_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s20 :: v_dual_mov_b32 v33, s21 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s22 :: v_dual_mov_b32 v35, s23 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s18 :: v_dual_mov_b32 v39, s29 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s30 :: v_dual_mov_b32 v41, s31 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s24 :: v_dual_mov_b32 v37, s19 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s28 :: v_dual_mov_b32 v55, s15 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s3 :: v_dual_mov_b32 v52, s12 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s13 :: v_dual_mov_b32 v54, s14 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s7 :: v_dual_mov_b32 v50, s2 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s27 :: v_dual_mov_b32 v46, s4 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s5 :: v_dual_mov_b32 v48, s6 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s25 :: v_dual_mov_b32 v44, s26 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s40 :: v_dual_mov_b32 v33, s41 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s42 :: v_dual_mov_b32 v35, s43 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v36, s38 :: v_dual_mov_b32 v39, s49 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v40, s50 :: v_dual_mov_b32 v41, s51 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v42, s44 :: v_dual_mov_b32 v37, s39 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v38, s48 :: v_dual_mov_b32 v55, s23 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v51, s11 :: v_dual_mov_b32 v52, s20 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v53, s21 :: v_dual_mov_b32 v54, s22 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v49, s15 :: v_dual_mov_b32 v50, s10 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v45, s47 :: v_dual_mov_b32 v46, s12 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v47, s13 :: v_dual_mov_b32 v48, s14 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v43, s45 :: v_dual_mov_b32 v44, s46 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s8 :: v_dual_mov_b32 v33, s9 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s10 :: v_dual_mov_b32 v35, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v32, s16 :: v_dual_mov_b32 v33, s17 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v34, s18 :: v_dual_mov_b32 v35, s19 ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2 ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[20:21], v[20:21], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[22:23], v[22:23], v[34:35] @@ -1652,58 +1652,58 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[14:15], v[14:15], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_mul_f32 v[12:13], v[12:13], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fmul_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[0:1], v[0:1], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[2:3], v[2:3], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[4:5], v[4:5], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[6:7], v[6:7], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[8:9], v[8:9], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[10:11], v[10:11], v[42:43] @@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[28:29], v[28:29], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_mul_f32 v[30:31], v[30:31], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -2428,46 +2428,46 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; ; GFX1250-SDAG-LABEL: fma_v32_vs: ; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[34:35] offset:16 -; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[34:35] -; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-SDAG-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v56, s[0:1] offset:16 +; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v56, s[0:1] +; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[20:21] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[22:23] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[30:31] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[28:29] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[12:13] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[14:15] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[4:5] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[6:7] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[24:25] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[26:27] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[18:19] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[40:41] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[42:43] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[50:51] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[48:49] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[52:53], s[20:21] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[54:55], s[22:23] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[46:47], s[12:13] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[48:49], s[14:15] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[42:43], s[44:45] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[44:45], s[46:47] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[36:37], s[38:39] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[40:41], v[40:41] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[0:1] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[40:41], s[8:9] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[38:39], v[38:39] -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[16:17] +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[38:39], s[36:37] ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[52:53], v[52:53] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[20:21], v[20:21], v[32:33], v[32:33] @@ -2482,58 +2482,58 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[14:15], v[14:15], v[36:37], v[36:37] ; GFX1250-SDAG-NEXT: v_pk_fma_f32 v[12:13], v[12:13], v[38:39], v[38:39] ; GFX1250-SDAG-NEXT: s_clause 0x7 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:96 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:64 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:80 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:48 -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[34:35] -; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[34:35] offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:96 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:64 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:80 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[12:15], s[0:1] +; GFX1250-SDAG-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:16 ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fma_v32_vs: ; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v56, 7, v0 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[34:35] -; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[34:35] offset:112 -; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4 +; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v56, s[0:1] +; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v56, s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v56, s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v56, s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v56, s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v56, s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v56, s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v56, s[0:1] offset:112 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: s_load_b512 s[36:51], s[4:5], 0xa4 +; GFX1250-GISEL-NEXT: s_load_b512 s[8:23], s[4:5], 0xe4 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[24:25] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[26:27] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[28:29] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[30:31] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[36:37] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[38:39] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[40:41] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[42:43] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[40:41], s[44:45] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[42:43], s[46:47] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[44:45], s[48:49] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[46:47], s[50:51] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[48:49], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[50:51], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[52:53], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[54:55], s[14:15] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[32:33], v[32:33] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[2:3], v[2:3], v[34:35], v[34:35] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[8:9] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[32:33], s[16:17] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[18:19] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[4:5], v[4:5], v[36:37], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[6:7], v[6:7], v[38:39], v[38:39] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[12:13] -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[20:21] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[22:23] ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5 ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[8:9], v[8:9], v[40:41], v[40:41] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[10:11], v[10:11], v[42:43], v[42:43] @@ -2553,14 +2553,14 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) { ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[28:29], v[28:29], v[36:37], v[36:37] ; GFX1250-GISEL-NEXT: v_pk_fma_f32 v[30:31], v[30:31], v[38:39], v[38:39] ; GFX1250-GISEL-NEXT: s_clause 0x7 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[34:35] -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[34:35] offset:16 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[34:35] offset:32 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[34:35] offset:48 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[34:35] offset:64 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[34:35] offset:80 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[34:35] offset:96 -; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[34:35] offset:112 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[0:3], s[0:1] +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:48 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:64 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:96 +; GFX1250-GISEL-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:112 ; GFX1250-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id @@ -3529,9 +3529,9 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; ; GFX1250-SDAG-LABEL: fadd_fadd_fsub: ; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-SDAG-NEXT: s_add_f32 s2, s1, s3 @@ -3541,14 +3541,14 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, v2 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[4:5], v[0:1] neg_lo:[0,1] neg_hi:[0,1] -; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[6:7] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: fadd_fadd_fsub: ; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x34 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] @@ -3560,7 +3560,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p ; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_subrev_f32 v3, s3, v0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[4:5] +; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[6:7] ; GFX1250-GISEL-NEXT: s_endpgm bb: %i12 = fadd <2 x float> %arg, %arg1 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 6509d80..f88b1bf 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX908-NEXT: {{ $}} ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def %25 + ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def %25 ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:av_128 = COPY %25 ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def %27 ; REGALLOC-GFX908-NEXT: SI_SPILL_AV64_SAVE %27, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) @@ -37,7 +37,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15 ; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7798794 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7929866 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3735562 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1 ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5) @@ -61,7 +61,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; REGALLOC-GFX90A-NEXT: {{ $}} ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef %6:agpr_32 - ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def %23 + ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def %23 ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:av_128_align2 = COPY %23 ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def %21 ; REGALLOC-GFX90A-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY %21 @@ -80,7 +80,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: liveins: $sgpr4_sgpr5 ; PEI-GFX90A-NEXT: {{ $}} ; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, undef renamable $agpr0 - ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7995402 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 8257546 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3 ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3997706 /* regdef:VReg_64_Align2 */, def renamable $vgpr2_vgpr3 ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index b717f85..6671201 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -186,12 +186,12 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32 ; ; GFX1250-LABEL: mixed_inreg_block_count_x: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b32 s2, s[0:1], 0x10 -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: s_load_b32 s4, s[0:1], 0x10 +; GFX1250-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX1250-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %load = load i32, ptr addrspace(4) %imp_arg_ptr diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index 4d367ef..c1764c9 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -346,10 +346,10 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad ; ; GFX1250-LABEL: byref_preload_arg: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100 +; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS @@ -404,10 +404,10 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o ; ; GFX1250-LABEL: byref_staggered_preload_arg: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x100 +; GFX1250-NEXT: s_load_b64 s[4:5], s[0:1], 0x100 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 -; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-NEXT: v_mov_b32_e32 v2, s5 ; GFX1250-NEXT: global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_storecnt 0x0 ; GFX1250-NEXT: global_store_b32 v0, v2, s[2:3] scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index b32e997..80afe7a 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -43,17 +43,17 @@ machineFunctionInfo: body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2424842 /* regdef:AGPR_32 */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27394058 /* regdef:VReg_512 */, def %7, 13697034 /* regdef:VReg_256 */, def %8, 6225930 /* regdef:VReg_128 */, def %9, 4915210 /* regdef:VReg_96 */, def %10, 4915210 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 40042506 /* regdef:VReg_512 */, def %7, 19464202 /* regdef:VReg_256 */, def %8, 7929866 /* regdef:VReg_128 */, def %9, 5963786 /* regdef:VReg_96 */, def %10, 5963786 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27394057 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13697033 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 40042505 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 19464201 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7929865 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5963785 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 5963785 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2424841 /* reguse:AGPR_32 */, killed $agpr1 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir index 1b09f5d..ad490f8 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-copy-from.mir @@ -41,9 +41,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_64_align2 = COPY $vgpr0_vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:av_64_align2 = COPY $vgpr2_vgpr3 ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] - ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:AV_64_Align2 */, [[V_MFMA_F64_4X4X4F64_e64_]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_128_align2 = COPY [[GLOBAL_LOAD_DWORDX4_]] + ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]]:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[COPY3]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]] ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 %1:av_64_align2 = COPY $vgpr0_vgpr1 @@ -51,7 +51,7 @@ body: | %3:areg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_128_align2 = COPY %3 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3801097 /* reguse:VReg_64_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3997705 /* reguse:VReg_64_Align2 */, %5 SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir index d7b713a..0b4e662 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-insert-extract.mir @@ -19,7 +19,7 @@ body: | ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -30,7 +30,7 @@ body: | %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -172,7 +172,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]].sub2_sub3:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -183,7 +183,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4.sub2_sub3 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -208,7 +208,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_vgprcd_e64_:%[0-9]+]].sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX2_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_vgprcd_e64_]].sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2_sub3:areg_128_align2 = IMPLICIT_DEF - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]].sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN @@ -219,7 +219,7 @@ body: | undef %4.sub2_sub3:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub1:areg_128_align2 = COPY %4.sub2 %5.sub2_sub3 = IMPLICIT_DEF - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) GLOBAL_STORE_DWORDX2 %0, %5.sub2_sub3, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir index 57f611b..dcf3b8b 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir +++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr-subreg-src2-chain.mir @@ -17,7 +17,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -26,7 +26,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -47,7 +47,7 @@ body: | ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:areg_128_align2 = GLOBAL_LOAD_DWORDX4 [[COPY]], 0, 0, implicit $exec :: (load (s128), addrspace 1) ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -56,7 +56,7 @@ body: | %3:vreg_128_align2 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, implicit $exec :: (load (s128), addrspace 1) %4:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %3.sub2_sub3, 0, 0, 0, implicit $mode, implicit $exec undef %5.sub0_sub1:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %5 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %5 GLOBAL_STORE_DWORDX4 %0, %5, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... @@ -79,7 +79,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -90,7 +90,7 @@ body: | %other_use:vreg_64_align2 = COPY %4.sub0_sub1 %5:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %6:areg_64_align2 = COPY %5 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %6:areg_64_align2 GLOBAL_STORE_DWORDX2 %0, %6, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN ... @@ -114,7 +114,7 @@ body: | ; CHECK-NEXT: undef [[V_MFMA_F64_4X4X4F64_e64_1:%[0-9]+]].sub0_sub1:areg_128_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:VS_64_with_sub0_in_VS_32_Lo128 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX2 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s64), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -126,7 +126,7 @@ body: | undef %5.sub0_sub1:vreg_128_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %4, 0, 0, 0, implicit $mode, implicit $exec %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec %7:areg_64_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4521993 /* reguse:AReg_64_Align2 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4915209 /* reguse:AReg_64_Align2 */, %7 GLOBAL_STORE_DWORDX2 %0, %7, 0, 0, implicit $exec :: (store (s64), addrspace 1) SI_RETURN @@ -151,7 +151,7 @@ body: | ; CHECK-NEXT: dead %other_use:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1 ; CHECK-NEXT: [[V_MFMA_F64_4X4X4F64_e64_2:%[0-9]+]]:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 [[COPY1]], [[COPY2]], [[V_MFMA_F64_4X4X4F64_e64_1]].sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_2]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -163,7 +163,7 @@ body: | %other_use:vreg_64_align2 = COPY %5.sub0_sub1 %6:vreg_64_align2 = V_MFMA_F64_4X4X4F64_vgprcd_e64 %1, %2, %5.sub0_sub1, 0, 0, 0, implicit $mode, implicit $exec undef %8.sub0_sub1:areg_128_align2 = COPY %6 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %8:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %8, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN @@ -231,7 +231,7 @@ body: | ; CHECK-NEXT: dead %other_use1:vreg_64_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub2_sub3 ; CHECK-NEXT: dead %other_use2:vreg_64 = COPY [[V_MFMA_F64_4X4X4F64_e64_]].sub1_sub2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:areg_128_align2 = COPY [[V_MFMA_F64_4X4X4F64_e64_]] - ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, [[COPY3]] + ; CHECK-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, [[COPY3]] ; CHECK-NEXT: GLOBAL_STORE_DWORDX4 [[COPY]], [[COPY3]], 0, 0, implicit $exec :: (store (s128), addrspace 1) ; CHECK-NEXT: SI_RETURN %0:vreg_64_align2 = COPY $vgpr4_vgpr5 @@ -245,7 +245,7 @@ body: | %other_use1:vreg_64_align2 = COPY %4.sub2_sub3 %other_use2:vreg_64 = COPY %4.sub1_sub2 %6:areg_128_align2 = COPY %4 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 8650761 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9568265 /* reguse:AReg_128_Align2 */, %6:areg_128_align2 GLOBAL_STORE_DWORDX4 %0, %6, 0, 0, implicit $exec :: (store (s128), addrspace 1) SI_RETURN ... diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 335d58c..a18847b 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -324,11 +324,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] -; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; SDAG-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v3 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -350,10 +348,9 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; SDAG-NEXT: s_cbranch_execz .LBB21_2 ; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private -; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; SDAG-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v2 ; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off @@ -367,12 +364,12 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v0, src_flat_scratch_base_hi +; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GISEL-NEXT: v_lshl_add_u64 v[4:5], v[2:3], 3, s[0:1] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_xor_b32_e32 v0, v5, v0 +; GISEL-NEXT: v_xor_b32_e32 v0, src_flat_scratch_base_hi, v5 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo @@ -394,11 +391,10 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private -; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-NEXT: v_sub_nc_u32_e32 v0, v4, v0 +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: v_subrev_nc_u32_e32 v0, src_flat_scratch_base_lo, v4 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll index b5bb68e..e0ea08d 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -97,9 +97,9 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b64_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b64 s[4:5], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -111,10 +111,10 @@ entry: define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b96_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -126,10 +126,10 @@ entry: define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b128_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -141,12 +141,12 @@ entry: define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b256_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -158,16 +158,16 @@ entry: define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { ; GCN-LABEL: s_load_b512_idxprom: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13 +; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15 +; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17 +; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19 ; GCN-NEXT: ; return to shader part epilog entry: %idxprom = zext i32 %idx to i64 @@ -275,11 +275,11 @@ entry: define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) { ; GCN-LABEL: s_load_b64_idxprom_range: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], s4 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -294,10 +294,10 @@ define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -312,10 +312,10 @@ define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -330,12 +330,12 @@ define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 @@ -350,16 +350,16 @@ define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 -; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 -; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 -; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 -; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 -; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GCN-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GCN-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v5, s9 +; GCN-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v7, s11 +; GCN-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s13 +; GCN-NEXT: v_dual_mov_b32 v10, s14 :: v_dual_mov_b32 v11, s15 +; GCN-NEXT: v_dual_mov_b32 v12, s16 :: v_dual_mov_b32 v13, s17 +; GCN-NEXT: v_dual_mov_b32 v14, s18 :: v_dual_mov_b32 v15, s19 ; GCN-NEXT: ; return to shader part epilog entry: %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir index 93cc12f..9484417 100644 --- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir @@ -57,6 +57,7 @@ body: | %4:vgpr_16 = COPY %3:sgpr_lo16 %5:vgpr_32 = V_ALIGNBIT_B32_t16_e64 0, %0:sreg_32, 0, killed %1:sreg_32, 0, killed %4:vgpr_16, 0, 0, implicit $exec S_ENDPGM 0, implicit %5 +... --- name: fold_16bit_madmix_clamp @@ -207,3 +208,27 @@ body: | $vgpr0 = COPY %4 S_ENDPGM 0, implicit $vgpr0 ... + +--- +name: fold_imm16_across_reg_sequence +tracksRegLiveness: true +registers: +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_imm16_across_reg_sequence + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[V_MOV_B16_t16_e64_1]], %subreg.hi16 + ; CHECK-NEXT: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F32_e64 0, -1, 0, -1, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F32_e64_]] + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + %1:vgpr_16 = V_MOV_B16_t16_e64 0, -1, 0, implicit $exec + %2:vgpr_32 = REG_SEQUENCE %0, %subreg.lo16, %1, %subreg.hi16 + %3:vgpr_32 = nofpexcept V_MAX_F32_e64 0, %2, 0, %2, 0, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %3 + S_ENDPGM 0, implicit $vgpr0 +... diff --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll index f2ecfe8..3d74b17 100644 --- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll +++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll @@ -17,16 +17,16 @@ define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_i8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_and_b32 s2, s2, 31 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_ashr_pk_i8_i32 v0, s0, s1, v0 -; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7] ; GFX1250-NEXT: s_endpgm %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0 %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1 @@ -58,16 +58,16 @@ define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i3 ; ; GFX1250-LABEL: v_ashr_pk_u8_i32: ; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_clause 0x1 ; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c -; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x24 +; GFX1250-NEXT: s_load_b64 s[6:7], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: s_and_b32 s2, s2, 31 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_mov_b32_e32 v0, s2 ; GFX1250-NEXT: v_ashr_pk_u8_i32 v0, s0, s1, v0 -; GFX1250-NEXT: global_store_b16 v1, v0, s[4:5] +; GFX1250-NEXT: global_store_b16 v1, v0, s[6:7] ; GFX1250-NEXT: s_endpgm %insert.0 = insertelement <2 x i32> poison, i32 %src0, i64 0 %build_vector = insertelement <2 x i32> %insert.0, i32 %src1, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index af8b9e7..6fe99d8 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -520,6 +520,7 @@ body: | ; GCN-NEXT: GLOBAL_STORE_DWORDX2 $vgpr0_vgpr1, $vgpr4_vgpr5, 16, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_ADD_I32 $sgpr0, 100, implicit-def $scc + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 20, implicit $exec $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 0, 0 :: (load (s64), addrspace 4) $vgpr0 = V_MOV_B32_e32 1, implicit $exec @@ -921,7 +922,6 @@ body: | $vgpr2 = V_MOV_B32_e32 1, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. --- name: wait_kmcnt_with_outstanding_vmem tracksRegLiveness: true @@ -937,6 +937,7 @@ body: | ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec @@ -944,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting sgpr0. --- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true @@ -960,6 +960,7 @@ body: | ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: S_WAIT_LOADCNT 0 ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 @@ -967,7 +968,6 @@ body: | $sgpr0 = S_MOV_B32 0 ... -# TODO: Unnecessary wait before overwriting vgpr0. --- name: overwrite_vgpr_after_smem tracksRegLiveness: true @@ -981,14 +981,12 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# TODO: Unnecessary wait before overwriting sgpr0. --- name: overwrite_sgpr_after_vmem tracksRegLiveness: true @@ -1002,7 +1000,6 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index a392692..6636eb5 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -211,38 +211,39 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX1250-SDAG-LABEL: workgroup_id_xyz: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_bfe_u32 s0, ttmp6, 0x40014 -; GFX1250-SDAG-NEXT: s_lshr_b32 s6, ttmp7, 16 -; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s0, 1 +; GFX1250-SDAG-NEXT: s_lshr_b32 s8, ttmp7, 16 +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s0, 1 +; GFX1250-SDAG-NEXT: s_clause 0x1 ; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 +; GFX1250-SDAG-NEXT: s_mul_i32 s4, s8, s9 ; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40010 -; GFX1250-SDAG-NEXT: s_mul_i32 s7, s6, s7 -; GFX1250-SDAG-NEXT: s_bfe_u32 s8, ttmp6, 0x40008 +; GFX1250-SDAG-NEXT: s_bfe_u32 s5, ttmp6, 0x40008 ; GFX1250-SDAG-NEXT: s_and_b32 s10, ttmp7, 0xffff ; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, 1 ; GFX1250-SDAG-NEXT: s_bfe_u32 s11, ttmp6, 0x4000c -; GFX1250-SDAG-NEXT: s_add_co_i32 s8, s8, s7 -; GFX1250-SDAG-NEXT: s_mul_i32 s7, s10, s9 +; GFX1250-SDAG-NEXT: s_add_co_i32 s5, s5, s4 +; GFX1250-SDAG-NEXT: s_mul_i32 s4, s10, s9 ; GFX1250-SDAG-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 ; GFX1250-SDAG-NEXT: s_add_co_i32 s11, s11, 1 -; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s7 -; GFX1250-SDAG-NEXT: s_and_b32 s7, ttmp6, 15 +; GFX1250-SDAG-NEXT: s_add_co_i32 s9, s9, s4 +; GFX1250-SDAG-NEXT: s_and_b32 s4, ttmp6, 15 ; GFX1250-SDAG-NEXT: s_mul_i32 s11, ttmp9, s11 ; GFX1250-SDAG-NEXT: s_getreg_b32 s12, hwreg(HW_REG_IB_STS2, 6, 4) -; GFX1250-SDAG-NEXT: s_add_co_i32 s7, s7, s11 +; GFX1250-SDAG-NEXT: s_add_co_i32 s4, s4, s11 ; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s12, 0 -; GFX1250-SDAG-NEXT: s_cselect_b32 s7, ttmp9, s7 +; GFX1250-SDAG-NEXT: s_cselect_b32 s4, ttmp9, s4 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7 -; GFX1250-SDAG-NEXT: s_cselect_b32 s7, s10, s9 -; GFX1250-SDAG-NEXT: s_cselect_b32 s6, s6, s8 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s6 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX1250-SDAG-NEXT: s_cselect_b32 s4, s10, s9 +; GFX1250-SDAG-NEXT: s_cselect_b32 s5, s8, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: s_clause 0x2 ; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX1250-SDAG-NEXT: global_store_b32 v0, v2, s[2:3] -; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[4:5] +; GFX1250-SDAG-NEXT: global_store_b32 v0, v3, s[6:7] ; GFX1250-SDAG-NEXT: s_endpgm ; ; GFX1250-GISEL-LABEL: workgroup_id_xyz: @@ -250,39 +251,40 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x4000c ; GFX1250-GISEL-NEXT: s_and_b32 s1, ttmp6, 15 ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 -; GFX1250-GISEL-NEXT: s_getreg_b32 s6, hwreg(HW_REG_IB_STS2, 6, 4) +; GFX1250-GISEL-NEXT: s_getreg_b32 s8, hwreg(HW_REG_IB_STS2, 6, 4) ; GFX1250-GISEL-NEXT: s_mul_i32 s0, ttmp9, s0 ; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-GISEL-NEXT: s_add_co_i32 s1, s1, s0 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: s_cselect_b32 s7, ttmp9, s1 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s9, ttmp9, s1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s0, ttmp6, 0x40010 -; GFX1250-GISEL-NEXT: s_and_b32 s8, ttmp7, 0xffff +; GFX1250-GISEL-NEXT: s_and_b32 s10, ttmp7, 0xffff ; GFX1250-GISEL-NEXT: s_add_co_i32 s0, s0, 1 -; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40004 -; GFX1250-GISEL-NEXT: s_mul_i32 s10, s8, s0 +; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40004 +; GFX1250-GISEL-NEXT: s_mul_i32 s12, s10, s0 +; GFX1250-GISEL-NEXT: s_clause 0x1 ; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 +; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x10 +; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s12 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 -; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, s10 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, s7 -; GFX1250-GISEL-NEXT: s_cselect_b32 s8, s8, s9 -; GFX1250-GISEL-NEXT: s_bfe_u32 s9, ttmp6, 0x40014 +; GFX1250-GISEL-NEXT: s_cselect_b32 s4, s10, s11 +; GFX1250-GISEL-NEXT: s_bfe_u32 s5, ttmp6, 0x40014 ; GFX1250-GISEL-NEXT: s_lshr_b32 s10, ttmp7, 16 -; GFX1250-GISEL-NEXT: s_add_co_i32 s9, s9, 1 +; GFX1250-GISEL-NEXT: s_add_co_i32 s5, s5, 1 ; GFX1250-GISEL-NEXT: s_bfe_u32 s11, ttmp6, 0x40008 -; GFX1250-GISEL-NEXT: s_mul_i32 s9, s10, s9 +; GFX1250-GISEL-NEXT: s_mul_i32 s5, s10, s5 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s9 -; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s6, 0 -; GFX1250-GISEL-NEXT: s_cselect_b32 s6, s10, s11 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, s6 +; GFX1250-GISEL-NEXT: s_add_co_i32 s11, s11, s5 +; GFX1250-GISEL-NEXT: s_cmp_eq_u32 s8, 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s5, s10, s11 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX1250-GISEL-NEXT: s_clause 0x2 ; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX1250-GISEL-NEXT: global_store_b32 v1, v2, s[2:3] -; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[4:5] +; GFX1250-GISEL-NEXT: global_store_b32 v1, v3, s[6:7] ; GFX1250-GISEL-NEXT: s_endpgm ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: |