Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
58 files changed, 16314 insertions, 5186 deletions
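The flat-scratch.ll hunks below update autogenerated checks for large scratch immediate offsets: the previously fused immediate 0x3e84 is now materialized as an s_movk_i32 of the 0x3e80 base followed by an s_add_i32 of 4 (0x3e80 + 4 = 0x3e84, i.e. 16004 bytes). A minimal IR sketch of the kind of input that exercises this pattern is given here for orientation; it is a reconstruction under assumed offsets, not the exact body of store_load_large_imm_offset_kernel:

; Sketch only: store to a scratch slot 0x3e84 bytes into a large private alloca.
define amdgpu_kernel void @large_imm_offset_sketch() {
bb:
  %alloca = alloca [4096 x i32], align 4, addrspace(5)
  ; element 4001 sits at byte offset 4001*4 = 16004 = 0x3e84 from the alloca base
  %gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %alloca, i32 0, i32 4001
  store volatile i32 15, ptr addrspace(5) %gep, align 4
  ret void
}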
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index a066b15..e6a8bac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -1917,8 +1917,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_movk_i32 s0, 0x3e84 +; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -1933,7 +1934,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3e84 +; GFX10-NEXT: s_movk_i32 s0, 0x3e80 +; GFX10-NEXT: s_add_i32 s0, s0, 4 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -1945,10 +1947,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX942-LABEL: store_load_large_imm_offset_kernel: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, 15 -; GFX942-NEXT: s_movk_i32 s0, 0x3e84 +; GFX942-NEXT: s_add_i32 s0, s0, 4 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -1958,7 +1961,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_movk_i32 s0, 0x3e84 +; GFX11-NEXT: s_movk_i32 s0, 0x3e80 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s0, s0, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -1986,8 +1991,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e84 +; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s0, 4 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -2002,7 +2008,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e84 +; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80 +; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s0, 4 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -2014,10 +2021,11 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; 
UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX942: ; %bb.0: ; %bb ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e84 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s0, 4 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -2027,7 +2035,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; UNALIGNED_GFX11-LABEL: store_load_large_imm_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e84 +; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s0, 4 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -2061,11 +2071,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 15 -; GFX9-NEXT: s_add_i32 s0, s32, 0x3e84 +; GFX9-NEXT: s_add_i32 s0, s1, 4 ; GFX9-NEXT: scratch_store_dword off, v0, s0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -2076,8 +2088,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: s_movk_i32 s0, 0x3e80 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 s0, s32, 0x3e84 +; GFX10-NEXT: s_add_i32 s1, s32, s0 +; GFX10-NEXT: s_add_i32 s0, s1, 4 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -2089,11 +2103,13 @@ define void @store_load_large_imm_offset_foo() { ; GFX942-LABEL: store_load_large_imm_offset_foo: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; GFX942-NEXT: v_mov_b32_e32 v0, 13 +; GFX942-NEXT: s_add_i32 s1, s32, s0 ; GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, 15 -; GFX942-NEXT: s_add_i32 s0, s32, 0x3e84 +; GFX942-NEXT: s_add_i32 s0, s1, 4 ; GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -2104,7 +2120,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; GFX11-NEXT: s_add_i32 s0, s32, 0x3e84 +; GFX11-NEXT: s_movk_i32 s0, 0x3e80 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_add_i32 s1, s32, s0 +; GFX11-NEXT: s_add_i32 s0, s1, 4 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 
offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc @@ -2133,11 +2152,13 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX9-LABEL: store_load_large_imm_offset_foo: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX9-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX9-NEXT: s_add_i32 s1, s32, s0 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s32, 0x3e84 +; UNALIGNED_GFX9-NEXT: s_add_i32 s0, s1, 4 ; UNALIGNED_GFX9-NEXT: scratch_store_dword off, v0, s0 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, off, s0 glc @@ -2148,8 +2169,10 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX10-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s32, 0x3e84 +; UNALIGNED_GFX10-NEXT: s_add_i32 s1, s32, s0 +; UNALIGNED_GFX10-NEXT: s_add_i32 s0, s1, 4 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: scratch_store_dword off, v1, s0 @@ -2161,11 +2184,13 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX942-LABEL: store_load_large_imm_offset_foo: ; UNALIGNED_GFX942: ; %bb.0: ; %bb ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x3e80 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 13 +; UNALIGNED_GFX942-NEXT: s_add_i32 s1, s32, s0 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v0, 15 -; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s32, 0x3e84 +; UNALIGNED_GFX942-NEXT: s_add_i32 s0, s1, 4 ; UNALIGNED_GFX942-NEXT: scratch_store_dword off, v0, s0 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 @@ -2176,7 +2201,10 @@ define void @store_load_large_imm_offset_foo() { ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 -; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s32, 0x3e84 +; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x3e80 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX11-NEXT: s_add_i32 s1, s32, s0 +; UNALIGNED_GFX11-NEXT: s_add_i32 s0, s1, 4 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 off, v1, s0 dlc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll index 7dce9ac..4c40009 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll @@ -997,8 +997,8 @@ define <33 x i32> @v33i32_func_v33i32_i32(ptr addrspace(1) %p, i32 %idx) #0 { ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[COPY3]](s32) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 256 - 
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C]] - ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[MUL]](s64) + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = nsw G_MUL [[SEXT]], [[C]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nusw inbounds G_PTR_ADD [[MV]], [[MUL]](s64) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:_(p1) = COPY [[PTR_ADD]](p1) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<33 x s32>) = G_LOAD [[COPY4]](p1) :: (load (<33 x s32>) from %ir.gep, align 256, addrspace 1) ; CHECK-NEXT: G_STORE [[LOAD]](<33 x s32>), [[COPY]](p5) :: (store (<33 x s32>), align 256, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index 6a4522f..d69a3e1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -141,11 +141,11 @@ body: | ; SIVI-NEXT: {{ $}} ; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5) ; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 ; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64) ; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) - ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p5) ; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; SIVI-NEXT: [[C1:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 ; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -157,9 +157,9 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) - ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p5) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p5) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -210,11 +210,11 @@ body: | ; SIVI-NEXT: {{ $}} ; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3) ; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64) ; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4) - ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY1]](p3) ; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -226,9 +226,9 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) - ; GFX9-NEXT: 
[[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 @@ -354,20 +354,20 @@ body: | ; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; SIVI-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY1]](<2 x p3>) + ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; SIVI-NEXT: [[COPY2:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 ; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY2]], [[C]](s64) ; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4) - ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; SIVI-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; SIVI-NEXT: [[C2:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; SIVI-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C1]] ; SIVI-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C2]] + ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; SIVI-NEXT: [[COPY3:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; SIVI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY3]], [[C]](s64) ; SIVI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (dereferenceable invariant load (s32), align 64, addrspace 4) - ; SIVI-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; SIVI-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[LOAD1]](s32) ; SIVI-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C1]] ; SIVI-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C2]] @@ -379,17 +379,17 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x p3>) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[UV:%[0-9]+]]:_(p3), [[UV1:%[0-9]+]]:_(p3) = G_UNMERGE_VALUES [[COPY]](<2 x p3>) + ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) - ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV]](p3) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV3]](s32) ; GFX9-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 -1 ; GFX9-NEXT: [[C1:%[0-9]+]]:_(p0) = G_CONSTANT i64 0 ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV]](p3), [[C]] ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(p0) = G_SELECT [[ICMP]](s1), [[MV]], [[C1]] + ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; GFX9-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_shared_base ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_1]](s64) - ; GFX9-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s32) = G_PTRTOINT [[UV1]](p3) ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT1]](s32), [[UV5]](s32) ; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[UV1]](p3), [[C]] ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(p0) = G_SELECT [[ICMP1]](s1), [[MV1]], [[C1]] @@ -506,19 +506,19 @@ body: | ; SIVI-NEXT: {{ $}} ; SIVI-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 ; SIVI-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX 
%stack.0 + ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) ; SIVI-NEXT: [[COPY1:%[0-9]+]]:_(p4) = COPY [[COPY]](p4) ; SIVI-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 68 ; SIVI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p4) = nuw inbounds G_PTR_ADD [[COPY1]], [[C]](s64) ; SIVI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (dereferenceable invariant load (s32), addrspace 4) - ; SIVI-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) ; SIVI-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[LOAD]](s32) ; SIVI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0) ; ; GFX9-LABEL: name: test_addrspacecast_p5_fi_to_p0 ; GFX9: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 + ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) ; GFX9-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64(s64) = S_MOV_B64 $src_private_base ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[S_MOV_B64_]](s64) - ; GFX9-NEXT: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[FRAME_INDEX]](p5) ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[PTRTOINT]](s32), [[UV1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p0) %0:_(p5) = G_FRAME_INDEX %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index c231aa8..ee57b72 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -1090,3 +1090,24 @@ body: | $vgpr9_vgpr10_vgpr11 = COPY %8 ... + +--- +name: test_unmerge_through_copy +body: | + bb.0: + liveins: $vgpr0 + + ; CHECK-LABEL: name: test_unmerge_through_copy + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s16), %2:_(s16) = G_UNMERGE_VALUES %0:_(s32) + %3:_(s16) = COPY %1:_(s16) + %4:_(s8), %5:_(s8) = G_UNMERGE_VALUES %3:_(s16) + %6:_(s32) = G_ZEXT %4:_(s8) + $vgpr0 = COPY %6:_(s32) +... diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll new file mode 100644 index 0000000..4b6375c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test code sequences for addrspacecast with globally addressable scratch. 
+ +target triple = "amdgcn-amd-amdhsa" + +define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) { +; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_lshlrev_b32 v1, 20, v0 +; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s2, -1 +; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo +; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1 +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_endpgm + %stof = addrspacecast ptr addrspace(5) %ptr to ptr + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspace(5) %ptr) { +; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast_nonnull: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 20, v0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] +; GFX1250-SDAG-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v2, 20, v2 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v3 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_endpgm + %stof = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p5(ptr addrspace(5) %ptr) + store volatile i32 0, ptr %stof + ret void +} + +define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) { +; GFX1250-LABEL: use_flat_to_private_addrspacecast: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm + %ftos = addrspacecast ptr %ptr to ptr addrspace(5) + store volatile i32 0, ptr addrspace(5) %ftos + ret void +} + +define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) { +; GFX1250-SDAG-LABEL: use_flat_to_private_addrspacecast_nonnull: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-SDAG-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: use_flat_to_private_addrspacecast_nonnull: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s0, s1 +; GFX1250-GISEL-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: s_endpgm + %ftos = call ptr addrspace(5) @llvm.amdgcn.addrspacecast.nonnull.p5.p0(ptr %ptr) + store volatile i32 0, ptr addrspace(5) %ftos + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll index 2ad7818..243f0ed 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll @@ -25,8 +25,11 @@ define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addr ; GCN-NEXT: s_load_b32 s0, s[4:5], 0x2c ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_cmp_eq_u32 s0, 0 -; GCN-NEXT: s_cbranch_scc1 .LBB0_2 -; GCN-NEXT: ; %bb.1: ; %bb2 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 +; GCN-NEXT: ; %bb.3: ; %bb +; GCN-NEXT: s_add_pc_i64 .LBB0_2-.Lpost_addpc0 +; GCN-NEXT: .Lpost_addpc0: +; GCN-NEXT: .LBB0_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: v_nop_e64 @@ -64,8 +67,8 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrs ; GCN-NEXT: s_cmp_eq_u32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB1_1 ; GCN-NEXT: ; %bb.3: ; 
%bb0 -; GCN-NEXT: s_add_pc_i64 .LBB1_2-.Lpost_addpc0 -; GCN-NEXT: .Lpost_addpc0: +; GCN-NEXT: s_add_pc_i64 .LBB1_2-.Lpost_addpc1 +; GCN-NEXT: .Lpost_addpc1: ; GCN-NEXT: .LBB1_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -106,8 +109,8 @@ define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr ; GCN-NEXT: s_cmp_eq_f32 s0, 0 ; GCN-NEXT: s_cbranch_scc0 .LBB2_1 ; GCN-NEXT: ; %bb.3: ; %bb0 -; GCN-NEXT: s_add_pc_i64 .LBB2_2-.Lpost_addpc1 -; GCN-NEXT: .Lpost_addpc1: +; GCN-NEXT: s_add_pc_i64 .LBB2_2-.Lpost_addpc2 +; GCN-NEXT: .Lpost_addpc2: ; GCN-NEXT: .LBB2_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -157,8 +160,8 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2 ; GCN-NEXT: s_cbranch_execnz .LBB3_1 ; GCN-NEXT: ; %bb.3: ; %bb -; GCN-NEXT: s_add_pc_i64 .LBB3_2-.Lpost_addpc2 -; GCN-NEXT: .Lpost_addpc2: +; GCN-NEXT: s_add_pc_i64 .LBB3_2-.Lpost_addpc3 +; GCN-NEXT: .Lpost_addpc3: ; GCN-NEXT: .LBB3_1: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; 32 bytes @@ -209,8 +212,8 @@ define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 { ; GCN-NEXT: s_cbranch_scc0 .LBB4_2 ; GCN-NEXT: ; %bb.3: ; %bb2 ; GCN-NEXT: ; in Loop: Header=BB4_1 Depth=1 -; GCN-NEXT: s_add_pc_i64 .LBB4_1-.Lpost_addpc3 -; GCN-NEXT: .Lpost_addpc3: +; GCN-NEXT: s_add_pc_i64 .LBB4_1-.Lpost_addpc4 +; GCN-NEXT: .Lpost_addpc4: ; GCN-NEXT: .LBB4_2: ; %bb3 ; GCN-NEXT: s_endpgm bb: @@ -242,8 +245,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_mov_b32 s0, -1 ; GCN-NEXT: s_cbranch_scc0 .LBB5_1 ; GCN-NEXT: ; %bb.7: ; %bb0 -; GCN-NEXT: s_add_pc_i64 .LBB5_4-.Lpost_addpc5 -; GCN-NEXT: .Lpost_addpc5: +; GCN-NEXT: s_add_pc_i64 .LBB5_4-.Lpost_addpc6 +; GCN-NEXT: .Lpost_addpc6: ; GCN-NEXT: .LBB5_1: ; %Flow ; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_vccnz .LBB5_3 @@ -268,11 +271,11 @@ define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr add ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: s_cbranch_execnz .LBB5_5 ; GCN-NEXT: ; %bb.9: ; %bb3 -; GCN-NEXT: s_add_pc_i64 .LBB5_2-.Lpost_addpc6 -; GCN-NEXT: .Lpost_addpc6: +; GCN-NEXT: s_add_pc_i64 .LBB5_2-.Lpost_addpc7 +; GCN-NEXT: .Lpost_addpc7: ; GCN-NEXT: .LBB5_5: ; %bb3 -; GCN-NEXT: s_add_pc_i64 .LBB5_3-.Lpost_addpc4 -; GCN-NEXT: .Lpost_addpc4: +; GCN-NEXT: s_add_pc_i64 .LBB5_3-.Lpost_addpc5 +; GCN-NEXT: .Lpost_addpc5: bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -310,8 +313,8 @@ define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr ad ; GCN-NEXT: s_cbranch_vccz .LBB6_2 ; GCN-NEXT: ; %bb.3: ; %loop ; GCN-NEXT: ; in Loop: Header=BB6_1 Depth=1 -; GCN-NEXT: s_add_pc_i64 .LBB6_1-.Lpost_addpc7 -; GCN-NEXT: .Lpost_addpc7: +; GCN-NEXT: s_add_pc_i64 .LBB6_1-.Lpost_addpc8 +; GCN-NEXT: .Lpost_addpc8: ; GCN-NEXT: .LBB6_2: ; %DummyReturnBlock ; GCN-NEXT: s_endpgm entry: @@ -350,8 +353,8 @@ define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { ; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-NEXT: ; %bb.5: ; %Flow -; GCN-NEXT: s_add_pc_i64 .LBB7_4-.Lpost_addpc8 -; GCN-NEXT: .Lpost_addpc8: +; GCN-NEXT: s_add_pc_i64 .LBB7_4-.Lpost_addpc9 +; GCN-NEXT: .Lpost_addpc9: ; GCN-NEXT: .LBB7_3: ; %bb2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -406,8 +409,8 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 % ; GCN-NEXT: v_cmpx_gt_u32_e32 16, v0 ; GCN-NEXT: 
s_cbranch_execnz .LBB8_1 ; GCN-NEXT: ; %bb.4: ; %entry -; GCN-NEXT: s_add_pc_i64 .LBB8_3-.Lpost_addpc9 -; GCN-NEXT: .Lpost_addpc9: +; GCN-NEXT: s_add_pc_i64 .LBB8_3-.Lpost_addpc10 +; GCN-NEXT: .Lpost_addpc10: ; GCN-NEXT: .LBB8_1: ; %if ; GCN-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 @@ -465,8 +468,8 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GCN-NEXT: s_cbranch_execnz .LBB9_3 ; GCN-NEXT: ; %bb.6: ; %Flow1 -; GCN-NEXT: s_add_pc_i64 .LBB9_5-.Lpost_addpc10 -; GCN-NEXT: .Lpost_addpc10: +; GCN-NEXT: s_add_pc_i64 .LBB9_5-.Lpost_addpc11 +; GCN-NEXT: .Lpost_addpc11: ; GCN-NEXT: .LBB9_3: ; %loop.preheader ; GCN-NEXT: s_mov_b32 vcc_lo, 0 ; GCN-NEXT: .LBB9_4: ; %loop @@ -484,8 +487,8 @@ define amdgpu_kernel void @analyze_mask_branch() #0 { ; GCN-NEXT: s_cbranch_vccnz .LBB9_5 ; GCN-NEXT: ; %bb.8: ; %loop ; GCN-NEXT: ; in Loop: Header=BB9_4 Depth=1 -; GCN-NEXT: s_add_pc_i64 .LBB9_4-.Lpost_addpc11 -; GCN-NEXT: .Lpost_addpc11: +; GCN-NEXT: s_add_pc_i64 .LBB9_4-.Lpost_addpc12 +; GCN-NEXT: .Lpost_addpc12: ; GCN-NEXT: .LBB9_5: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm entry: @@ -528,20 +531,20 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cmp_lt_i32 s3, 6 ; GCN-NEXT: s_cbranch_scc0 .LBB10_1 ; GCN-NEXT: ; %bb.10: ; %bb -; GCN-NEXT: s_add_pc_i64 .LBB10_4-.Lpost_addpc13 -; GCN-NEXT: .Lpost_addpc13: +; GCN-NEXT: s_add_pc_i64 .LBB10_4-.Lpost_addpc14 +; GCN-NEXT: .Lpost_addpc14: ; GCN-NEXT: .LBB10_1: ; %Flow ; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 ; GCN-NEXT: s_cbranch_vccnz .LBB10_2 ; GCN-NEXT: ; %bb.12: ; %Flow -; GCN-NEXT: s_add_pc_i64 .LBB10_5-.Lpost_addpc14 -; GCN-NEXT: .Lpost_addpc14: +; GCN-NEXT: s_add_pc_i64 .LBB10_5-.Lpost_addpc15 +; GCN-NEXT: .Lpost_addpc15: ; GCN-NEXT: .LBB10_2: ; %Flow5 ; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_vccz .LBB10_3 ; GCN-NEXT: ; %bb.14: ; %Flow5 -; GCN-NEXT: s_add_pc_i64 .LBB10_6-.Lpost_addpc15 -; GCN-NEXT: .Lpost_addpc15: +; GCN-NEXT: s_add_pc_i64 .LBB10_6-.Lpost_addpc16 +; GCN-NEXT: .Lpost_addpc16: ; GCN-NEXT: .LBB10_3: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s1, 9 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 @@ -553,8 +556,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GCN-NEXT: ; %bb.8: ; %bb14 -; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc12 -; GCN-NEXT: .Lpost_addpc12: +; GCN-NEXT: s_add_pc_i64 .LBB10_7-.Lpost_addpc13 +; GCN-NEXT: .Lpost_addpc13: ; GCN-NEXT: .LBB10_4: ; %bb13 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: v_nop_e64 @@ -565,8 +568,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_sleep 0 ; GCN-NEXT: s_cbranch_execz .LBB10_5 ; GCN-NEXT: ; %bb.16: ; %bb13 -; GCN-NEXT: s_add_pc_i64 .LBB10_2-.Lpost_addpc16 -; GCN-NEXT: .Lpost_addpc16: +; GCN-NEXT: s_add_pc_i64 .LBB10_2-.Lpost_addpc17 +; GCN-NEXT: .Lpost_addpc17: ; GCN-NEXT: .LBB10_5: ; %bb9 ; GCN-NEXT: s_cmp_lt_i32 s3, 11 ; GCN-NEXT: s_cselect_b32 s0, -1, 0 @@ -577,8 +580,8 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GCN-NEXT: s_cbranch_vccnz .LBB10_6 ; GCN-NEXT: ; %bb.18: ; %bb9 -; GCN-NEXT: s_add_pc_i64 .LBB10_3-.Lpost_addpc17 -; GCN-NEXT: .Lpost_addpc17: +; GCN-NEXT: s_add_pc_i64 .LBB10_3-.Lpost_addpc18 +; GCN-NEXT: .Lpost_addpc18: ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 ; 
GCN-NEXT: .LBB10_7: ; %bb19 diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index bdb52db..d1a1112 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -1,8 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FAKE16-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16,GFX11-FAKE16-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-TRUE16-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16,GFX11-TRUE16-GISEL %s define void @undef_lo_v2i16(i16 %arg0) { +; GFX8-SDAG-LABEL: undef_lo_v2i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo_v2i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -12,20 +37,48 @@ define void @undef_lo_v2i16(i16 %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo_v2i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo_v2i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undef_lo_v2i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v0.h, v0.l +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1 call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.lo); ret void } define void @undef_lo_v2f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_lo_v2f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo_v2f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -35,20 +88,52 @@ define void @undef_lo_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undef_lo_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x half> poison, half %arg0, i32 1 call void asm sideeffect "; use $0", "v"(<2 x half> %undef.lo); ret void } define void @undef_lo_op_v2f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_lo_op_v2f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX8-SDAG-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0x7e00, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo_op_v2f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX8-GISEL-NEXT: v_add_f16_e64 v1, s4, 1.0 +; GFX8-GISEL-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo_op_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -59,16 +144,27 @@ define void @undef_lo_op_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo_op_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00 -; GFX8-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, 0x7e00, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo_op_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undef_lo_op_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x half> poison, half %arg0, i32 1 %op = fadd <2 x half> %undef.lo, <half 1.0, half 1.0> call void asm sideeffect "; use $0", "v"(<2 x half> %op); @@ -76,26 +172,93 @@ define void @undef_lo_op_v2f16(half %arg0) { } define void @undef_lo_op_v2i16(i16 %arg0) { -; GFX9-LABEL: undef_lo_op_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x63 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: undef_lo_op_v2i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX8-SDAG-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo_op_v2i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, 0x63 -; GFX8-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-GISEL-LABEL: undef_lo_op_v2i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0x63 +; GFX8-GISEL-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-GISEL-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: undef_lo_op_v2i16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x63 +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: ;;#ASMSTART +; GFX9-SDAG-NEXT: ; use v0 +; GFX9-SDAG-NEXT: ;;#ASMEND +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: undef_lo_op_v2i16: +; GFX9-GISEL: ; %bb.0: 
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x630063 +; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-GISEL-NEXT: ;;#ASMSTART +; GFX9-GISEL-NEXT: ; use v0 +; GFX9-GISEL-NEXT: ;;#ASMEND +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-SDAG-LABEL: undef_lo_op_v2i16: +; GFX11-FAKE16-SDAG: ; %bb.0: +; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-FAKE16-SDAG-NEXT: ; use v0 +; GFX11-FAKE16-SDAG-NEXT: ;;#ASMEND +; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-GISEL-LABEL: undef_lo_op_v2i16: +; GFX11-FAKE16-GISEL: ; %bb.0: +; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0 +; GFX11-FAKE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-FAKE16-GISEL-NEXT: ; use v0 +; GFX11-FAKE16-GISEL-NEXT: ;;#ASMEND +; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_lo_op_v2i16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v0 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_lo_op_v2i16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <2 x i16> poison, i16 %arg0, i32 1 %op = add <2 x i16> %undef.lo, <i16 99, i16 99> call void asm sideeffect "; use $0", "v"(<2 x i16> %op); @@ -103,6 +266,26 @@ define void @undef_lo_op_v2i16(i16 %arg0) { } define void @undef_lo3_v4i16(i16 %arg0) { +; GFX8-SDAG-LABEL: undef_lo3_v4i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo3_v4i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo3_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -112,20 +295,49 @@ define void @undef_lo3_v4i16(i16 %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo3_v4i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo3_v4i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undef_lo3_v4i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v[0:1] +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <4 x i16> poison, i16 %arg0, i32 1 call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo); ret void } define void @undef_lo3_v4f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_lo3_v4f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo3_v4f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo3_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -135,20 +347,50 @@ define void @undef_lo3_v4f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo3_v4f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo3_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: undef_lo3_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v[0:1] +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] %undef.lo = insertelement <4 x half> poison, half %arg0, i32 1 call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo); ret void } define void @undef_lo2_v4i16(<2 x i16> %arg0) { +; GFX8-SDAG-LABEL: undef_lo2_v4i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-SDAG-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo2_v4i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, 
v0 +; GFX8-GISEL-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo2_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -159,21 +401,62 @@ define void @undef_lo2_v4i16(<2 x i16> %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo2_v4i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo2_v4i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4i16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-TRUE16-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v[0:1] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_lo2_v4i16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1] +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.lo = shufflevector <2 x i16> %arg0, <2 x i16> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3> call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.lo); ret void } define void @undef_lo2_v4f16(<2 x half> %arg0) { +; GFX8-SDAG-LABEL: undef_lo2_v4f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-SDAG-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_lo2_v4f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-GISEL-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_lo2_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -184,21 +467,57 @@ define void @undef_lo2_v4f16(<2 x half> %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_lo2_v4f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-FAKE16-LABEL: undef_lo2_v4f16: +; 
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v0, v0, 0x7060302 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v[0:1] +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_lo2_v4f16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-TRUE16-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v[0:1] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_lo2_v4f16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v[0:1] +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.lo = shufflevector <2 x half> %arg0, <2 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 3> call void asm sideeffect "; use $0", "v"(<4 x half> %undef.lo); ret void } define void @undef_hi_v2i16(i16 %arg0) { +; GFX8-SDAG-LABEL: undef_hi_v2i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi_v2i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -207,19 +526,36 @@ define void @undef_hi_v2i16(i16 %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi_v2i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi_v2i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x i16> poison, i16 %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<2 x i16> %undef.hi); ret void } define void @undef_hi_v2f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_hi_v2f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi_v2f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -228,19 +564,42 @@ define void @undef_hi_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi_v2f16: -; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x half> poison, half %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<2 x half> %undef.hi); ret void } define void @undef_hi_op_v2f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_hi_op_v2f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi_op_v2f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 +; GFX8-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX8-GISEL-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi_op_v2f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -250,15 +609,14 @@ define void @undef_hi_op_v2f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi_op_v2f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi_op_v2f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x half> poison, half %arg0, i32 0 %op = fadd <2 x half> %undef.hi, <half 1.0, half 1.0> call void asm sideeffect "; use $0", "v"(<2 x half> %op); @@ -266,24 +624,82 @@ define void @undef_hi_op_v2f16(half %arg0) { } define void @undef_hi_op_v2i16(i16 %arg0) { -; GFX9-LABEL: undef_hi_op_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x63 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] -; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 -; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX8-SDAG-LABEL: undef_hi_op_v2i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_add_u16_e32 v0, 0x63, v0 +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v0 +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi_op_v2i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v0, 0x63, v0 -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v0 -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX8-GISEL-LABEL: undef_hi_op_v2i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX8-GISEL-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-GISEL-NEXT: v_add_u16_e32 v0, 0x63, v0 +; GFX8-GISEL-NEXT: s_lshl_b32 s4, s4, 16 +; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s4, v0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v0 +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-SDAG-LABEL: undef_hi_op_v2i16: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x63 +; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, s4 op_sel_hi:[1,0] +; GFX9-SDAG-NEXT: ;;#ASMSTART +; GFX9-SDAG-NEXT: ; use v0 +; GFX9-SDAG-NEXT: ;;#ASMEND +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: undef_hi_op_v2i16: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0x630063 +; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, v1 +; GFX9-GISEL-NEXT: ;;#ASMSTART +; GFX9-GISEL-NEXT: ; use v0 +; GFX9-GISEL-NEXT: ;;#ASMEND +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-SDAG-LABEL: undef_hi_op_v2i16: +; GFX11-FAKE16-SDAG: ; %bb.0: +; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-FAKE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-FAKE16-SDAG-NEXT: ; use v0 +; GFX11-FAKE16-SDAG-NEXT: ;;#ASMEND +; GFX11-FAKE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-GISEL-LABEL: undef_hi_op_v2i16: +; GFX11-FAKE16-GISEL: ; %bb.0: +; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0 +; GFX11-FAKE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-FAKE16-GISEL-NEXT: ; use v0 +; GFX11-FAKE16-GISEL-NEXT: ;;#ASMEND +; GFX11-FAKE16-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-SDAG-LABEL: undef_hi_op_v2i16: +; GFX11-TRUE16-SDAG: ; %bb.0: +; GFX11-TRUE16-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-SDAG-NEXT: v_pk_add_u16 v0, 0x63, v0 op_sel_hi:[0,1] +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMSTART +; GFX11-TRUE16-SDAG-NEXT: ; use v0 +; GFX11-TRUE16-SDAG-NEXT: ;;#ASMEND +; GFX11-TRUE16-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-GISEL-LABEL: undef_hi_op_v2i16: +; GFX11-TRUE16-GISEL: ; %bb.0: +; GFX11-TRUE16-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-GISEL-NEXT: v_pk_add_u16 v0, 0x630063, v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMSTART +; GFX11-TRUE16-GISEL-NEXT: ; use v0 +; GFX11-TRUE16-GISEL-NEXT: ;;#ASMEND +; GFX11-TRUE16-GISEL-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <2 x i16> poison, i16 %arg0, i32 0 %op = add <2 x i16> %undef.hi, <i16 99, i16 99> call void asm sideeffect "; use $0", "v"(<2 x i16> %op); @@ -291,6 +707,24 @@ define void @undef_hi_op_v2i16(i16 %arg0) { } define void @undef_hi3_v4i16(i16 %arg0) { +; GFX8-SDAG-LABEL: undef_hi3_v4i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi3_v4i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi3_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -299,19 +733,37 @@ define void @undef_hi3_v4i16(i16 %arg0) { ; 
GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi3_v4i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi3_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v[0:1] +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <4 x i16> poison, i16 %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.hi); ret void } define void @undef_hi3_v4f16(half %arg0) { +; GFX8-SDAG-LABEL: undef_hi3_v4f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi3_v4f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi3_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -320,19 +772,39 @@ define void @undef_hi3_v4f16(half %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi3_v4f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi3_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v[0:1] +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = insertelement <4 x half> poison, half %arg0, i32 0 call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi); ret void } define void @undef_hi2_v4i16(<2 x i16> %arg0) { +; GFX8-SDAG-LABEL: undef_hi2_v4i16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi2_v4i16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi2_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -341,19 +813,39 @@ define void @undef_hi2_v4i16(<2 x i16> %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi2_v4i16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi2_v4i16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v[0:1] +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 
s[30:31] %undef.hi = shufflevector <2 x i16> %arg0, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> call void asm sideeffect "; use $0", "v"(<4 x i16> %undef.hi); ret void } define void @undef_hi2_v4f16(<2 x half> %arg0) { +; GFX8-SDAG-LABEL: undef_hi2_v4f16: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: ;;#ASMSTART +; GFX8-SDAG-NEXT: ; use v[0:1] +; GFX8-SDAG-NEXT: ;;#ASMEND +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: undef_hi2_v4f16: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-GISEL-NEXT: ;;#ASMSTART +; GFX8-GISEL-NEXT: ; use v[0:1] +; GFX8-GISEL-NEXT: ;;#ASMEND +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; ; GFX9-LABEL: undef_hi2_v4f16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -362,15 +854,16 @@ define void @undef_hi2_v4f16(<2 x half> %arg0) { ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: undef_hi2_v4f16: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use v[0:1] -; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: undef_hi2_v4f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: ;;#ASMSTART +; GFX11-NEXT: ; use v[0:1] +; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: s_setpc_b64 s[30:31] %undef.hi = shufflevector <2 x half> %arg0, <2 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi); ret void } - +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX8: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index 1b9b508..cefcbdd 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -457,27 +457,58 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 7, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T6.X, 1 +; EG-NEXT: ALU 37, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_64 T8.XY, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR * T0.W, T0.X, literal.x, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT * T0.W, T8.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR * T0.W, T8.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.Y, PV.W, -; EG-NEXT: AND_INT * T0.W, T0.X, literal.x, +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.X, PV.W, -; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV * T0.X, T5.X, +; EG-NEXT: AND_INT * T0.W, T8.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR * T0.W, T8.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.Y, T1.W, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T8.X, T4.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16 @@ -570,33 +601,94 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; EG-LABEL: v_ctpop_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 13, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T8.X, 1 +; EG-NEXT: ALU 73, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1 ; EG-NEXT: CF_END ; 
EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR * T0.W, T0.Z, literal.x, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: LSHR * T0.W, T12.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT * T0.W, PV.W, +; EG-NEXT: LSHL T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T12.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV * T0.X, T5.X, +; EG-NEXT: LSHR * T0.W, T12.Y, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.Z, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.Z, PS, -; EG-NEXT: LSHR * T1.W, T0.X, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T12.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.Y, PS, PV.W, +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T0.X, T8.X, +; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.Y, PV.W, +; EG-NEXT: BCNT_INT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T0.X, PV.W, -; EG-NEXT: LSHR * T8.X, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV * T0.X, T9.X, +; EG-NEXT: LSHR * T0.W, T12.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T12.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: LSHR T12.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T0.W, PS, PV.W, ; EG-NEXT: 2(2.802597e-45), 
0(0.000000e+00) +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.X, T4.X, +; EG-NEXT: MOV * T0.Z, T8.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32 @@ -745,46 +837,174 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; EG-LABEL: v_ctpop_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 25, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T13.X, 1 +; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @8 +; EG-NEXT: ALU 114, @16, KC0[], KC1[] +; EG-NEXT: ALU 34, @131, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T12.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T20.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T21.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 5(7.006492e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 13: -; EG-NEXT: LSHR * T0.W, T12.Z, literal.x, +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: LSHR * T0.W, T20.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T12.W, PV.W, -; EG-NEXT: AND_INT * T0.W, T12.Z, literal.x, +; EG-NEXT: BCNT_INT * T0.W, PV.W, +; EG-NEXT: LSHL T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T20.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV * T0.X, T5.X, +; EG-NEXT: LSHR * T0.W, T20.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T20.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.Y, PS, PV.W, +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T0.X, T8.X, +; EG-NEXT: LSHR * T0.W, T20.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T12.Z, PS, -; EG-NEXT: LSHR T0.W, T0.Z, literal.x, -; EG-NEXT: LSHR * T1.W, T12.X, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; 
EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T20.Z, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV * T0.X, T9.X, +; EG-NEXT: LSHR * T0.W, T20.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T12.Y, PS, -; EG-NEXT: AND_INT T0.Z, T0.Z, literal.x, +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T0.W, T20.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) ; EG-NEXT: BCNT_INT T0.W, PV.W, -; EG-NEXT: AND_INT * T1.W, T12.X, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.X, T12.X, +; EG-NEXT: LSHR * T1.W, T21.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T1.W, T21.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV * T0.X, T13.X, +; EG-NEXT: LSHR * T1.W, T21.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T1.W, T21.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T20.Y, PS, PV.W, +; EG-NEXT: MOV T13.X, PV.Y, +; EG-NEXT: MOV * T0.X, T16.X, +; EG-NEXT: LSHR * T1.W, T21.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: ALU clause starting at 131: +; EG-NEXT: MOV * T16.X, T1.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT * T1.W, T21.Z, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV * T0.X, T17.X, +; EG-NEXT: LSHR * T1.W, T21.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, ; EG-NEXT: 
65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: BCNT_INT T12.X, PS, -; EG-NEXT: BCNT_INT T0.Z, PV.Z, -; EG-NEXT: LSHR T1.W, T0.X, literal.x, -; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T13.X, PS, literal.x, -; EG-NEXT: BCNT_INT T0.Y, PV.W, -; EG-NEXT: AND_INT * T1.W, T0.X, literal.y, -; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) -; EG-NEXT: BCNT_INT T0.X, PV.W, -; EG-NEXT: LSHR * T14.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: AND_INT T1.W, T21.W, literal.x, +; EG-NEXT: LSHR * T21.X, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) +; EG-NEXT: AND_INT T0.Z, PV.X, literal.x, +; EG-NEXT: BCNT_INT T1.W, PV.W, +; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y, +; EG-NEXT: -65536(nan), 16(2.242078e-44) +; EG-NEXT: LSHR T22.X, PS, literal.x, +; EG-NEXT: OR_INT * T20.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T17.X, PV.W, +; EG-NEXT: MOV * T0.X, T4.X, +; EG-NEXT: MOV * T0.Z, T8.X, +; EG-NEXT: MOV T20.X, T12.X, +; EG-NEXT: MOV * T20.Z, T16.X, BS:VEC_120/SCL_212 %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid %val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32 @@ -1292,7 +1512,7 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace( ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; SI-NEXT: .LBB14_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: ctpop_i16_in_br: diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index e6c38d2..747affa 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -495,8 +495,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 @@ -2697,19 +2698,16 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: 
v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/empty-text.ll b/llvm/test/CodeGen/AMDGPU/empty-text.ll new file mode 100644 index 0000000..8aa8600 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/empty-text.ll @@ -0,0 +1,9 @@ +; Test that there is no s_code_end padding if .text is otherwise empty. + +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN + +@globalVar = global i32 37 + +declare amdgpu_ps void @funcDecl() + +; GCN-NOT: .fill diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7524750..5fb50d0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2608,9 +2608,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.l, v0.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 2ff66c9..7d36c9f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -252,13 +252,15 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -277,9 +279,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -292,15 +296,16 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr 
inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -314,13 +319,16 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 ; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -344,11 +352,13 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ 
-367,8 +377,11 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -381,18 +394,19 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -406,13 +420,16 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 ; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -433,11 +450,13 @@ define amdgpu_ps 
void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -455,9 +474,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -465,13 +486,14 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -483,14 +505,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 ; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo +; 
GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -508,10 +533,12 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -529,8 +556,11 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm ; @@ -538,16 +568,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -559,14 +590,17 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: 
$vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 ; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 @@ -642,13 +676,15 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -667,9 +703,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] @@ -683,15 +721,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -705,13 +744,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 ; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] @@ -736,11 +778,13 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -759,8 +803,11 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] @@ -774,18 +821,19 @@ define amdgpu_ps <2 x 
float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -799,13 +847,16 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 ; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[4:5] @@ -827,11 +878,13 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, 
s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -849,9 +902,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] @@ -862,13 +917,14 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -880,14 +936,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 ; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] @@ -908,10 +967,12 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi 
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -929,8 +990,11 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3] @@ -941,16 +1005,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -962,14 +1027,17 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 
v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] @@ -1048,13 +1116,15 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1073,9 +1143,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] @@ -1089,15 +1161,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1111,13 +1184,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: 
flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 ; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] @@ -1142,11 +1218,13 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1165,8 +1243,11 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[2:3] @@ -1180,18 +1261,19 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; 
GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1205,13 +1287,16 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 ; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[2:3], v[0:1], v[4:5] @@ -1233,11 +1318,13 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1255,9 +1342,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] @@ -1268,13 +1357,14 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1286,14 +1376,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 ; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] @@ -1314,10 +1407,12 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1335,8 +1430,11 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: 
s_cbranch_execz .LBB29_2 ; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] @@ -1347,16 +1445,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1368,14 +1467,17 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 ; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] @@ -1454,13 +1556,15 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1479,9 +1583,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 @@ -1496,15 +1602,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1518,13 +1625,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 ; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, 
-1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 @@ -1550,11 +1660,13 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1573,8 +1685,11 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 @@ -1589,18 +1704,19 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; 
GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1614,13 +1730,16 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 ; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 @@ -1643,11 +1762,13 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1665,9 +1786,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 @@ -1679,13 +1802,14 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1697,14 +1821,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 ; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 @@ -1726,10 +1853,12 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1747,8 +1876,11 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 @@ -1760,16 +1892,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1781,14 +1914,17 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 ; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 @@ -1868,13 +2004,15 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1893,9 +2031,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-SDAG-NEXT: 
.LBB42_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 @@ -1910,15 +2050,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -1932,13 +2073,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 ; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 @@ -1964,11 +2108,13 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -1987,8 +2133,11 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 ; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 @@ -2003,18 +2152,19 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2028,13 +2178,16 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 ; 
GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 @@ -2057,11 +2210,13 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2079,9 +2234,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 @@ -2093,13 +2250,14 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2111,14 +2269,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV 
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 ; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 @@ -2140,10 +2301,12 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2161,8 +2324,11 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 @@ -2174,16 +2340,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2195,14 +2362,17 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 ; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 @@ -2282,13 +2452,15 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2307,9 +2479,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 @@ -2324,15 +2498,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; 
GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2346,13 +2521,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 ; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 @@ -2378,11 +2556,13 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2401,8 +2581,11 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, 
s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 @@ -2417,18 +2600,19 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2442,13 +2626,16 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 ; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 @@ -2471,11 +2658,13 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 
v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2493,9 +2682,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 @@ -2507,13 +2698,14 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2525,14 +2717,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 ; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, 
vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 @@ -2554,10 +2749,12 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2575,8 +2772,11 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 @@ -2588,16 +2788,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2609,14 +2810,17 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 ; 
GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 ; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 @@ -2690,13 +2894,15 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2715,10 +2921,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3] @@ -2732,15 +2940,16 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 
bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2753,15 +2962,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 ; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5] @@ -2786,11 +2998,13 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2809,9 +3023,12 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[2:3], v[0:1], v[2:3] @@ -2825,18 +3042,19 @@ define 
amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2849,15 +3067,18 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 ; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[2:3], v[0:1], v[4:5] @@ -2879,11 +3100,13 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: 
v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2900,9 +3123,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3] @@ -2913,13 +3138,14 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -2930,14 +3156,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 ; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] @@ -2958,10 +3187,12 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -2978,8 +3209,11 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_i64 v[0:1], v[0:1], v[2:3] @@ -2990,16 +3224,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3010,14 +3245,17 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 ; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; 
GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] @@ -3090,13 +3328,15 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3115,10 +3355,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3] @@ -3132,15 +3374,16 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3153,15 +3396,18 @@ define amdgpu_ps <2 x float> 
@flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 ; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5] @@ -3186,11 +3432,13 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3209,9 +3457,12 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[2:3], v[0:1], v[2:3] @@ -3225,18 +3476,19 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3249,15 +3501,18 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 ; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[2:3], v[0:1], v[4:5] @@ -3279,11 +3534,13 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3300,9 +3557,11 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; 
GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3] @@ -3313,13 +3572,14 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3330,14 +3590,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 ; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] @@ -3358,10 +3621,12 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3378,8 +3643,11 @@ define 
amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_i64 v[0:1], v[0:1], v[2:3] @@ -3390,16 +3658,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3410,14 +3679,17 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 ; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] @@ -3490,13 +3762,15 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 
src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3515,10 +3789,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3] @@ -3532,15 +3808,16 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3553,15 +3830,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 ; GFX1250-GISEL-NEXT: .LBB74_4: 
; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5] @@ -3586,11 +3866,13 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3609,9 +3891,12 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[2:3], v[0:1], v[2:3] @@ -3625,18 +3910,19 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: 
v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3649,15 +3935,18 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 ; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[2:3], v[0:1], v[4:5] @@ -3679,11 +3968,13 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3700,9 +3991,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3] @@ -3713,13 +4006,14 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; 
GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3730,14 +4024,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 ; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] @@ -3758,10 +4055,12 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3778,8 +4077,11 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: 
v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_max_u64 v[0:1], v[0:1], v[2:3] @@ -3790,16 +4092,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3810,14 +4113,17 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 ; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] @@ -3890,13 +4196,15 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: 
s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -3915,10 +4223,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3] @@ -3932,15 +4242,16 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -3953,15 +4264,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 ; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 
v[2:3], v[0:1], v[4:5] @@ -3986,11 +4300,13 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4009,9 +4325,12 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[2:3], v[0:1], v[2:3] @@ -4025,18 +4344,19 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4049,15 
+4369,18 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 ; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 v[2:3], v[0:1], v[4:5] @@ -4079,11 +4402,13 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4100,9 +4425,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3] @@ -4113,13 +4440,14 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4130,14 +4458,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 ; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] @@ -4158,10 +4489,12 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4178,8 +4511,11 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_min_u64 v[0:1], v[0:1], v[2:3] @@ -4190,16 +4526,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4210,14 +4547,17 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 ; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 ; GFX1250-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] @@ -4310,14 +4650,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4338,9 +4680,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; 
GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] @@ -4356,15 +4700,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4380,13 +4725,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 ; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] @@ -4414,11 +4762,13 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[2:3], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4439,8 +4789,11 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] @@ -4456,18 +4809,19 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_bitop2_b32 v0, s0, v3 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4483,13 +4837,16 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 ; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] @@ -4512,13 +4869,15 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 -; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4538,9 +4897,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] @@ -4553,13 +4914,14 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: 
s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4573,14 +4935,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 ; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] @@ -4603,10 +4968,12 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v2 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4626,8 +4993,11 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v2, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] @@ -4640,16 +5010,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4663,14 +5034,17 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 ; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] @@ -4742,13 +5116,15 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4766,15 +5142,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -4786,15 +5163,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4806,21 +5184,24 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 ; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -4843,11 +5224,13 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4865,14 +5248,16 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 @@ -4884,18 +5269,19 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4907,21 +5293,24 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 ; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 @@ -4941,11 +5330,13 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -4961,14 +5352,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm @@ -4977,13 +5369,14 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -4993,20 +5386,23 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 ; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm @@ -5025,10 +5421,12 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5044,13 +5442,15 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 1, v[0:1] ; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 ; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-SDAG-NEXT: s_endpgm @@ -5059,16 +5459,17 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -5078,20 +5479,23 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; 
GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 ; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] ; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm @@ -5161,13 +5565,15 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[2:3], v[0:1] +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5185,10 +5591,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5207,15 +5615,16 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; 
GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -5227,15 +5636,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 ; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5265,11 +5677,13 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, s0, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5287,9 +5701,12 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 
; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v0, s0, v4 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5308,18 +5725,19 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14 +; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -5331,15 +5749,18 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 ; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo ; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5366,11 +5787,13 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG: ; 
%bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5386,9 +5809,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5404,13 +5829,14 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -5420,14 +5846,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 ; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, 
s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5453,10 +5882,12 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v4, s0, v1 ; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: v_cmpx_lt_u32_e32 0x3ffffff, v4 ; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 ; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow @@ -5472,8 +5903,11 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] -; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: v_subrev_nc_u32_e32 v4, s0, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] @@ -5489,16 +5923,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base -; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 ; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3 +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1 ; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 ; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow @@ -5508,14 +5943,17 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: s_endpgm ; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global ; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV -; GFX1250-GISEL-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2 ; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 ; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 ; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo ; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index b25d9b2..fc88839 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -3621,7 +3621,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_movk_i32 s0, 0x3004 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: s_add_i32 s0, s0, 4 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3637,7 +3638,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 ; GFX10-NEXT: v_mov_b32_e32 v0, 13 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_movk_i32 s0, 0x3804 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-NEXT: s_add_i32 s0, s0, 4 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3682,7 +3684,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3004 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3716,8 +3719,9 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0 -; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3804 +; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3739,7 +3743,8 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() { ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3804 +; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 +; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3785,10 
+3790,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-LABEL: store_load_large_imm_offset_foo: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_add_i32 s1, s32, s0 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s32, 0x3004 +; GFX9-NEXT: s_add_i32 s0, s1, 4 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -3800,8 +3807,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-NEXT: s_add_i32 s0, s32, 0x3804 +; GFX10-NEXT: s_add_i32 s1, s32, s0 +; GFX10-NEXT: s_add_i32 s0, s1, 4 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 @@ -3843,10 +3852,12 @@ define void @store_load_large_imm_offset_foo() { ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: ; GFX9-PAL: ; %bb.0: ; %bb ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-PAL-NEXT: s_add_i32 s1, s32, s0 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x3004 +; GFX9-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3872,8 +3883,10 @@ define void @store_load_large_imm_offset_foo() { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x3804 +; GFX10-PAL-NEXT: s_add_i32 s1, s32, s0 +; GFX10-PAL-NEXT: s_add_i32 s0, s1, 4 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll index 0adbecd..e59fbad 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll @@ -173,8 +173,8 @@ define amdgpu_ps <3 x half> @test_fmaximum_v3f16_vv(<3 x half> %a, <3 x half> %b ; ; GFX12-GISEL-TRUE16-LABEL: test_fmaximum_v3f16_vv: ; GFX12-GISEL-TRUE16: ; %bb.0: -; GFX12-GISEL-TRUE16-NEXT: v_maximum_f16 v1.l, v1.l, v3.l ; GFX12-GISEL-TRUE16-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-GISEL-TRUE16-NEXT: v_maximum_f16 v1.l, v1.l, v3.l ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: test_fmaximum_v3f16_vv: diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll index e1d35b5..b25120f 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll @@ -173,8 +173,8 @@ define amdgpu_ps <3 x half> @test_fminimum_v3f16_vv(<3 x half> %a, <3 x half> %b ; ; GFX12-GISEL-TRUE16-LABEL: test_fminimum_v3f16_vv: ; GFX12-GISEL-TRUE16: ; %bb.0: -; GFX12-GISEL-TRUE16-NEXT: v_minimum_f16 v1.l, v1.l, v3.l ; GFX12-GISEL-TRUE16-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-GISEL-TRUE16-NEXT: v_minimum_f16 v1.l, v1.l, v3.l ; 
GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: test_fminimum_v3f16_vv: diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index ceacdf5..cbda062 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,45 +1,184 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-FASTFMA,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,SI-DENORM,GCN-DENORM-SLOWFMA,SI %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD,SI-FLUSH,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tahiti 
-denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-FASTFMA,SI-DENORM-FASTFMA-CONTRACT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI-DENORM-SLOWFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-MAD %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-MAD %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-MAD %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-FLUSH,GFX9-FLUSH-FMAC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DENORM,GFX9-DENORM-FASTFMA-FMAC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s - -; FIXME: Should probably test this, but sometimes selecting fmac is painful to match. -; XUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM-FASTFMA %s - -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-FMAC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1030 -denormal-fp-math-f32=ieee -mattr=+mad-mac-f32-insts -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10-DENORM %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. 
target triple = "amdgcn--" - declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fmuladd.f32(float, float, float) #1 declare half @llvm.fmuladd.f16(half, half, half) #1 declare float @llvm.fabs.f32(float) #1 -; GCN-LABEL: {{^}}fmuladd_f32: -; GCN-FLUSH-MAD: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; GCN-FLUSH-FMAC: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GCN-DENORM-FASTFMA: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+}} - -; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmuladd_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; 
SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(1) +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[10:11] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[12:13] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v3, v0, s[14:15] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; 
GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-NEXT: s_endpgm %r0 = load float, ptr addrspace(1) %in1 %r1 = load float, ptr addrspace(1) %in2 %r2 = load float, ptr addrspace(1) %in3 @@ -48,18 +187,190 @@ define amdgpu_kernel void @fmuladd_f32(ptr addrspace(1) %out, ptr addrspace(1) % ret void } -; GCN-LABEL: {{^}}fmul_fadd_f32: -; GCN-FLUSH: v_mac_f32 - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 - -; GCN-DENORM-STRICT: v_mul_f32_e32 -; GCN-DENORM-STRICT: v_add_f32_e32 -define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmul_fadd_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fmul_fadd_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v0, off, s[12:15], 0 
glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fmul_fadd_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s8, s0 +; 
SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fmul_fadd_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fmul_fadd_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fmul_fadd_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fmul_fadd_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %r0 = load volatile float, ptr addrspace(1) %in1 %r1 = load volatile float, ptr addrspace(1) %in2 %r2 = load volatile float, ptr addrspace(1) %in3 @@ -69,15 +380,172 @@ define amdgpu_kernel void @fmul_fadd_f32(ptr addrspace(1) %out, ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}fmul_fadd_contract_f32: -; GCN-FLUSH-FMAC: v_fmac_f32_e32 - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 - -; GCN-DENORM-FASTFMA: v_fma_f32 -define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, - ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +define amdgpu_kernel void @fmul_fadd_contract_f32(ptr 
addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 { +; SI-FLUSH-LABEL: fmul_fadd_contract_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s11, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s10, -1 +; SI-FLUSH-NEXT: s_mov_b32 s14, s10 +; SI-FLUSH-NEXT: s_mov_b32 s15, s11 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s12, s2 +; SI-FLUSH-NEXT: s_mov_b32 s13, s3 +; SI-FLUSH-NEXT: s_mov_b32 s16, s4 +; SI-FLUSH-NEXT: s_mov_b32 s17, s5 +; SI-FLUSH-NEXT: s_mov_b32 s18, s10 +; SI-FLUSH-NEXT: s_mov_b32 s19, s11 +; SI-FLUSH-NEXT: s_mov_b32 s4, s6 +; SI-FLUSH-NEXT: s_mov_b32 s5, s7 +; SI-FLUSH-NEXT: s_mov_b32 s6, s10 +; SI-FLUSH-NEXT: s_mov_b32 s7, s11 +; SI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b32 s8, s0 +; SI-FLUSH-NEXT: s_mov_b32 s9, s1 +; SI-FLUSH-NEXT: v_mac_f32_e32 v2, v0, v1 +; SI-FLUSH-NEXT: buffer_store_dword v2, off, s[8:11], 0 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmul_fadd_contract_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v0, v0, v1, v2 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmul_fadd_contract_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s11, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s10, -1 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s14, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s15, s11 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s12, s2 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s13, s3 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s16, s4 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s17, s5 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s18, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s19, s11 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s4, s6 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s5, s7 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, s10 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, s11 +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v0, off, 
s[12:15], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s8, s0 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s9, s1 +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v0, v0, v1 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v0, v0, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmul_fadd_contract_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmul_fadd_contract_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, v2, v3 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[8:9] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmul_fadd_contract_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v3, s[8:9] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmul_fadd_contract_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[10:11] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[12:13] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v3, v0, s[14:15] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: 
global_store_dword v0, v3, s[8:9] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmul_fadd_contract_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[4:5] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX10-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-NEXT: s_endpgm %r0 = load volatile float, ptr addrspace(1) %in1 %r1 = load volatile float, ptr addrspace(1) %in2 %r2 = load volatile float, ptr addrspace(1) %in3 @@ -87,23 +555,120 @@ define amdgpu_kernel void @fmul_fadd_contract_f32(ptr addrspace(1) %out, ptr add ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt 
lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, 
s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -117,24 +682,120 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_a_2.0_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_a_2.0_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_a_2.0_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; 
GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_a_2.0_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_a_2.0_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -148,28 +809,126 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fadd_a_a_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; 
VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-FLUSH-LABEL: fadd_a_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_a_a_b_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fadd_a_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_a_a_b_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; 
SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fadd_a_a_b_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fadd_a_a_b_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_a_a_b_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fadd_a_a_b_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -184,28 +943,126 @@ define amdgpu_kernel void @fadd_a_a_b_f32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fadd_b_a_a_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH: v_mac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: 
{{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, - ptr addrspace(1) %in1, - ptr addrspace(1) %in2) #0 { +define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1, ptr addrspace(1) %in2) #0 { +; SI-FLUSH-LABEL: fadd_b_a_a_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fadd_b_a_a_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fadd_b_a_a_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fadd_b_a_a_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: 
s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fadd_b_a_a_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fadd_b_a_a_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fadd_b_a_a_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fadd_b_a_a_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -220,20 +1077,120 @@ define amdgpu_kernel void @fadd_b_a_a_f32(ptr addrspace(1) %out, ret void } -; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] - -; GCN-DENORM-FASTFMA: 
v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; 
GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_neg_2.0_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -247,25 +1204,120 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; XXX -; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], 2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], 2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-FLUSH: ; %bb.0: +; 
SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, 2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, 2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_neg_2.0_neg_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, 2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -281,24 +1333,120 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, pt ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], - -; GCN-FLUSH-MAD: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; GCN-FLUSH-FMAC: v_fmac_f32_e32 [[R2]], -2.0, [[R1]] - -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 
v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_neg_a_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-MAD-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-MAD: ; %bb.0: +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-MAD-NEXT: v_fma_f32 v1, v1, -2.0, v2 +; GFX9-DENORM-FASTFMA-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-FASTFMA-MAD-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-FLUSH-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX9-DENORM-FASTFMA-FMAC-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX9-DENORM-FASTFMA-FMAC: ; %bb.0: +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-FASTFMA-FMAC-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX9-DENORM-FASTFMA-FMAC-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-DENORM-FASTFMA-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_neg_a_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fmac_f32_e32 v2, -2.0, v1 +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -314,23 +1462,107 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] -; GCN-FLUSH-FMAC: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; SI-FLUSH: buffer_store_dword [[RESULT]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] - -; GCN-DENORM-FASTFMA: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-SLOWFMA: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-DENORM-FASTFMA: ; %bb.0: +; SI-DENORM-FASTFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-NEXT: 
s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-NEXT: v_fma_f32 v2, v2, 2.0, -v3 +; SI-DENORM-FASTFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fmuladd_2.0_a_neg_b_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; GFX9-FLUSH-MAD-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-FLUSH-MAD: ; %bb.0: +; GFX9-FLUSH-MAD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-MAD-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-MAD-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-MAD-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-MAD-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX9-FLUSH-FMAC-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX9-FLUSH-FMAC: ; %bb.0: +; GFX9-FLUSH-FMAC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-FMAC-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-FMAC-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-FMAC-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-FMAC-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-FMAC-NEXT: s_endpgm +; +; GFX10-LABEL: fmuladd_2.0_a_neg_b_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_fma_f32 v1, v1, 2.0, -v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; 
GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -346,23 +1578,150 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(ptr addrspace(1) %out, ptr ad ret void } -; GCN-LABEL: {{^}}mad_sub_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 
s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -380,24 +1739,150 @@ define amdgpu_kernel void @mad_sub_f32(ptr addrspace(1) noalias nocapture %out, ret void } -; GCN-LABEL: {{^}}mad_sub_inv_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] - -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_inv_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_inv_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; 
SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v4, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_inv_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v4, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_inv_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_inv_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: 
global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_inv_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_inv_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_inv_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v3, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -415,23 +1900,150 @@ define amdgpu_kernel void @mad_sub_inv_f32(ptr addrspace(1) noalias nocapture %o ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]| - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| - -; SI: buffer_store_dword [[RESULT]] -; VI: 
{{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_fabs_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, v3, -|v4| +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e64 v2, v2, |v4| +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, v2, |v4| +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, -|v4| +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_fabs_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3| +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_fabs_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3| +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, v2, -|v3| +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_fabs_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: 
global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, v1, |v3| +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -450,24 +2062,150 @@ define amdgpu_kernel void @mad_sub_fabs_f32(ptr addrspace(1) noalias nocapture % ret void } -; GCN-LABEL: {{^}}mad_sub_fabs_inv_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; GCN-FLUSH-MAD: v_mad_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| -; GCN-FLUSH-FMA: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]| - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_sub_f32_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, -v2, v3, |v4| +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: 
v_sub_f32_e64 v2, |v4|, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e64 v2, |v4|, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_sub_fabs_inv_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, -v2, v3, |v4| +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3| +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_sub_fabs_inv_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v2, |v3| +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_sub_fabs_inv_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_sub_f32_e64 v1, |v3|, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -486,26 +2224,150 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f32(ptr addrspace(1) noalias nocaptu ret void } -; GCN-LABEL: {{^}}neg_neg_mad_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] - -; GCN-FLUSH: v_mac_f32_e32 [[REGC]], [[REGA]], [[REGB]] -; SI-FLUSH: buffer_store_dword [[REGC]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; GCN-DENORM-STRICT: v_mul_f32_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] -; GCN-DENORM-STRICT: v_add_f32_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: neg_neg_mad_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 
glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mac_f32_e32 v4, v2, v3 +; SI-FLUSH-NEXT: buffer_store_dword v4, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: neg_neg_mad_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: neg_neg_mad_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v4, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: neg_neg_mad_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 
offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, v3, v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: neg_neg_mad_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: neg_neg_mad_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: neg_neg_mad_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v1, v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v3, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: neg_neg_mad_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -525,23 +2387,150 @@ define amdgpu_kernel void @neg_neg_mad_f32(ptr addrspace(1) noalias nocapture %o ret void } -; GCN-LABEL: {{^}}mad_fabs_sub_f32: -; GCN: {{buffer|flat|global}}_load_dword [[REGA:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGB:v[0-9]+]] -; GCN: {{buffer|flat|global}}_load_dword [[REGC:v[0-9]+]] -; 
GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; GCN-DENORM-STRICT: v_mul_f32_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #0 { +; SI-FLUSH-LABEL: mad_fabs_sub_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s7, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s6, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, |v3|, -v4 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: mad_fabs_sub_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mul_f32_e64 v2, v2, |v3| +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: mad_fabs_sub_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; 
SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-SLOWFMA-NEXT: v_mul_f32_e64 v2, v2, |v3| +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v4 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: mad_fabs_sub_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s7, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s6, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:8 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, |v3|, -v4 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: mad_fabs_sub_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, -v3 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: mad_fabs_sub_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2| +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: mad_fabs_sub_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, |v2|, 
-v3 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: mad_fabs_sub_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_mul_f32_e64 v1, v1, |v2| +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, ptr addrspace(1) %ptr, i64 %tid.ext @@ -560,24 +2549,126 @@ define amdgpu_kernel void @mad_fabs_sub_f32(ptr addrspace(1) noalias nocapture % ret void } -; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mac_f32_e32 [[R2]], -2.0, [[R1]] -; SI-FLUSH: buffer_store_dword [[R2]] -; VI-FLUSH: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] - -; SI-DENORM: buffer_store_dword [[RESULT]] -; VI-DENORM: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mac_f32_e32 v3, -2.0, v2 +; SI-FLUSH-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v3, v2 +; 
SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v3, v2 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_c_fadd_a_a_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, -2.0, v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fsub_c_fadd_a_a_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mac_f32_e32 v2, -2.0, v1 +; 
GFX10-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fsub_c_fadd_a_a_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 @@ -593,22 +2684,126 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f32(ptr addrspace(1) %out, ptr addrsp ret void } -; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f32: -; GCN: {{buffer|flat|global}}_load_dword [[R1:v[0-9]+]], -; GCN: {{buffer|flat|global}}_load_dword [[R2:v[0-9]+]], -; GCN-FLUSH: v_mad_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-FASTFMA-CONTRACT: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] - -; GCN-DENORM-SLOWFMA-CONTRACT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-SLOWFMA-CONTRACT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; GCN-DENORM-STRICT: v_add_f32_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] -; GCN-DENORM-STRICT: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] - -; SI: buffer_store_dword [[RESULT]] -; VI: {{global|flat}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; SI-FLUSH: ; %bb.0: +; SI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-FLUSH-NEXT: s_mov_b32 s3, 0xf000 +; SI-FLUSH-NEXT: s_mov_b32 s2, 0 +; SI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-FLUSH-NEXT: v_mov_b32_e32 v1, 0 +; SI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; SI-FLUSH-NEXT: v_mad_f32 v2, v2, 2.0, -v3 +; SI-FLUSH-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-FLUSH-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-STRICT-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-FASTFMA-STRICT: ; %bb.0: +; SI-DENORM-FASTFMA-STRICT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-STRICT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-STRICT-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-FASTFMA-STRICT-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-FASTFMA-STRICT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-STRICT-NEXT: s_endpgm +; +; SI-DENORM-SLOWFMA-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-SLOWFMA: ; %bb.0: +; 
SI-DENORM-SLOWFMA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-SLOWFMA-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-SLOWFMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-SLOWFMA-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-SLOWFMA-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-SLOWFMA-NEXT: v_add_f32_e32 v2, v2, v2 +; SI-DENORM-SLOWFMA-NEXT: v_sub_f32_e32 v2, v2, v3 +; SI-DENORM-SLOWFMA-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-SLOWFMA-NEXT: s_endpgm +; +; SI-DENORM-FASTFMA-CONTRACT-LABEL: fsub_fadd_a_a_c_f32: +; SI-DENORM-FASTFMA-CONTRACT: ; %bb.0: +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s3, 0xf000 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_mov_b32 s2, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_mov_b32_e32 v1, 0 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 glc +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_waitcnt vmcnt(0) +; SI-DENORM-FASTFMA-CONTRACT-NEXT: v_fma_f32 v2, v2, 2.0, -v3 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-DENORM-FASTFMA-CONTRACT-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX9-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: fsub_fadd_a_a_c_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX9-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f32: +; GFX10-FLUSH: ; %bb.0: +; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLUSH-NEXT: v_mad_f32 v1, v1, 2.0, -v2 +; GFX10-FLUSH-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: s_endpgm +; +; GFX10-DENORM-LABEL: fsub_fadd_a_a_c_f32: +; GFX10-DENORM: ; %bb.0: +; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 +; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v1, v0, s[0:1] glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: global_load_dword v2, v0, s[0:1] offset:4 glc dlc +; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX10-DENORM-NEXT: v_add_f32_e32 v1, v1, v1 +; GFX10-DENORM-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX10-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-DENORM-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir index 7fad2f4..a88b1ec 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index.mir @@ -75,7 +75,8 @@ stack: body: | bb.0: ; CHECK-LABEL: name: fold_frame_index__s_add_i32__fi_materializedconst_0 - ; CHECK: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, 256, implicit-def $scc + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 256 + ; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]], implicit-def $scc ; CHECK-NEXT: $sgpr4 = COPY [[S_ADD_I32_]] ; CHECK-NEXT: SI_RETURN implicit $sgpr4 %0:sreg_32 = S_MOV_B32 %stack.0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index cc43142..2f2d727 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -46,7 +46,8 @@ body: | %2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc ... # GCN-LABEL: name: test_frameindex{{$}} -# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 70 +# GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 70 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %stack.0, [[S_MOV_B32_]] --- name: test_frameindex tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index 15cda62..f2fe61f 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -360,7 +360,8 @@ entry: ; s_add_i32. 
; GCN-LABEL: {{^}}fi_sop2_s_add_u32_literal_error: -; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0, 0x2010 +; GCN: s_movk_i32 [[S_MOVK_I32_:s[0-9]+]], 0x1000 +; GCN: s_add_u32 [[ADD_LO:s[0-9]+]], 0x1010, [[S_MOVK_I32_]] ; GCN: s_addc_u32 [[ADD_HI:s[0-9]+]], s{{[0-9]+}}, 0 define amdgpu_kernel void @fi_sop2_s_add_u32_literal_error() #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index ac4f0df..308e86b 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -5692,10 +5692,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -5725,10 +5721,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -6351,10 +6343,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) @@ -6384,10 +6372,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0 -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v4, v0 ; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4 ; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -12347,14 +12331,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX6-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX6-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 
addr64 offset:2 ; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2 ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -12392,14 +12371,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX7-SDAG-NEXT: s_mov_b32 s5, s6 ; GFX7-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2 ; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 +; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2 ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -12474,11 +12448,7 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_lshrrev_b16 v1, 8, v0 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SDAG-NEXT: global_store_byte v[2:3], v4, off offset:2 +; GFX10-SDAG-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2 ; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -12499,36 +12469,15 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { ; GFX10-GISEL-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8: -; GFX11-SDAG-TRUE16: ; %bb.0: -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: global_load_b32 v1, v[0:1], off -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1 -; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v4, off offset:2 -; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off -; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8: -; GFX11-SDAG-FAKE16: ; %bb.0: -; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b16 v1, 8, v0 -; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX11-SDAG-FAKE16-NEXT: s_clause 0x1 -; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[2:3], v0, off offset:2 -; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v1, off -; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-LABEL: freeze_v3i8: +; GFX11-SDAG: ; 
%bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: global_store_d16_hi_b8 v[2:3], v0, off offset:2 +; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: freeze_v3i8: ; GFX11-GISEL: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index 723e3ef..1602e31 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4326,9 +4326,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(ptr addrspace( ; ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi: ; GFX12-GISEL-TRUE16: ; %bb.0: -; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi: @@ -4366,9 +4365,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(ptr ; ; GFX12-GISEL-TRUE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: ; GFX12-GISEL-TRUE16: ; %bb.0: -; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 +; GFX12-GISEL-TRUE16-NEXT: global_load_d16_hi_b16 v0, v0, s[2:3] offset:-128 ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: diff --git a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll index 1c298014..3001248 100644 --- a/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/issue130120-eliminate-frame-index.ll @@ -6,16 +6,24 @@ define amdgpu_gfx [13 x i32] @issue130120() { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_add_i32 s0, s32, 0xf0 -; CHECK-NEXT: s_add_i32 s1, s32, 0xf4 -; CHECK-NEXT: s_add_i32 s2, s32, 0xf8 -; CHECK-NEXT: s_add_i32 s3, s32, 0xfc +; CHECK-NEXT: s_movk_i32 s1, 0xf4 +; CHECK-NEXT: s_movk_i32 s2, 0xf8 +; CHECK-NEXT: s_movk_i32 s3, 0xfc +; CHECK-NEXT: s_movk_i32 s34, 0x100 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_add_i32 s34, s32, 0x100 -; CHECK-NEXT: s_add_i32 s35, s32, 0x104 -; CHECK-NEXT: s_add_i32 s36, s32, 0x108 -; CHECK-NEXT: s_add_i32 s37, s32, 0x110 -; CHECK-NEXT: s_add_i32 s38, s32, 0x120 +; CHECK-NEXT: s_movk_i32 s35, 0x104 +; CHECK-NEXT: s_movk_i32 s36, 0x108 +; CHECK-NEXT: s_movk_i32 s37, 0x110 +; CHECK-NEXT: s_movk_i32 s38, 0x120 +; CHECK-NEXT: s_add_i32 s0, s32, 0xf0 +; CHECK-NEXT: s_add_i32 s1, s32, s1 +; CHECK-NEXT: s_add_i32 s2, s32, s2 +; CHECK-NEXT: s_add_i32 s3, s32, s3 +; CHECK-NEXT: s_add_i32 s34, s32, s34 +; CHECK-NEXT: s_add_i32 s35, s32, s35 +; CHECK-NEXT: s_add_i32 s36, s32, s36 +; CHECK-NEXT: s_add_i32 s37, s32, s37 +; CHECK-NEXT: s_add_i32 s38, s32, s38 ; CHECK-NEXT: s_or_b32 s39, s32, 4 ; CHECK-NEXT: s_or_b32 s40, s32, 8 ; CHECK-NEXT: s_or_b32 s41, s32, 12 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index bad2e60..a2da887 
100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -1025,67 +1025,74 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; EG-LABEL: v3i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @10, KC0[], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 -; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X +; EG-NEXT: ALU 0, @12, KC0[], KC1[] +; EG-NEXT: TEX 2 @6 +; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 +; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 44, #3 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 48, #3 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 +; EG-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T5.X, 0.0, +; EG-NEXT: ALU clause starting at 13: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T2.X, T2.W, PV.W, -; EG-NEXT: LSHL * T2.W, literal.x, PV.W, +; EG-NEXT: LSHL T5.X, T2.W, PV.W, +; EG-NEXT: LSHL * T5.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T2.Y, 0.0, -; EG-NEXT: MOV * T2.Z, 0.0, -; EG-NEXT: LSHR T0.X, T0.W, literal.x, -; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: MOV * T5.Z, 0.0, +; EG-NEXT: LSHR T8.X, T0.W, literal.x, +; EG-NEXT: LSHL T0.W, T7.X, literal.y, +; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, +; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT T6.X, PV.W, PS, +; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: v3i16_arg: ; CM: ; %bb.0: ; %entry ; CM-NEXT: ALU 0, @12, KC0[], KC1[] -; CM-NEXT: TEX 0 @8 -; CM-NEXT: ALU 13, @13, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT MSKOR T1.XW, T2.X -; CM-NEXT: ALU 1, @27, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 0 @10 -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: TEX 2 @6 +; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 8: -; CM-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3 -; CM-NEXT: Fetch clause starting at 10: -; CM-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 +; CM-NEXT: VTX_READ_16 T7.X, T5.X, 46, #3 +; CM-NEXT: VTX_READ_16 T5.X, T5.X, 48, #3 ; CM-NEXT: ALU clause starting at 12: -; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: MOV * T5.X, 0.0, ; CM-NEXT: ALU clause starting at 13: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T1.X, literal.x, +; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, ; CM-NEXT: LSHL * T1.W, PV.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) -; CM-NEXT: 
LSHL T1.X, PV.Z, PV.W, -; CM-NEXT: LSHL * T1.W, literal.x, PV.W, +; CM-NEXT: LSHL T5.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T5.W, literal.x, PV.W, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: MOV T1.Y, 0.0, -; CM-NEXT: MOV * T1.Z, 0.0, -; CM-NEXT: LSHR * T2.X, T0.W, literal.x, +; CM-NEXT: MOV T5.Y, 0.0, +; CM-NEXT: MOV * T5.Z, 0.0, +; CM-NEXT: LSHL T0.Z, T7.X, literal.x, +; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, +; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: ALU clause starting at 27: -; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: LSHR * T8.X, T0.W, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: store <3 x i16> %in, ptr addrspace(1) %out, align 4 @@ -2669,47 +2676,205 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) { ; ; EG-LABEL: v8i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @14, KC0[], KC1[] -; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 +; EG-NEXT: ALU 1, @36, KC0[], KC1[] +; EG-NEXT: TEX 0 @20 +; EG-NEXT: ALU 5, @38, KC0[], KC1[] +; EG-NEXT: TEX 0 @22 +; EG-NEXT: ALU 5, @44, KC0[], KC1[] +; EG-NEXT: TEX 0 @24 +; EG-NEXT: ALU 5, @50, KC0[], KC1[] +; EG-NEXT: TEX 0 @26 +; EG-NEXT: ALU 5, @56, KC0[], KC1[] +; EG-NEXT: TEX 0 @28 +; EG-NEXT: ALU 5, @62, KC0[], KC1[] +; EG-NEXT: TEX 0 @30 +; EG-NEXT: ALU 5, @68, KC0[], KC1[] +; EG-NEXT: TEX 0 @32 +; EG-NEXT: ALU 5, @74, KC0[], KC1[] +; EG-NEXT: TEX 0 @34 +; EG-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3 -; EG-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: MOV T1.Y, T2.X, -; EG-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: MOV * T1.W, T3.X, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: Fetch clause starting at 20: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 +; EG-NEXT: Fetch clause starting at 22: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 +; EG-NEXT: Fetch clause starting at 24: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 +; EG-NEXT: Fetch clause starting at 26: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 +; EG-NEXT: Fetch clause starting at 28: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 +; EG-NEXT: Fetch clause starting at 30: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 +; EG-NEXT: Fetch clause starting at 32: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 +; EG-NEXT: Fetch clause starting at 34: +; EG-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 +; EG-NEXT: ALU clause starting at 36: +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: MOV * T7.X, 0.0, +; EG-NEXT: ALU clause starting at 38: +; EG-NEXT: LSHL T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T5.X, +; EG-NEXT: ALU clause starting at 44: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T8.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: 
MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: ALU clause starting at 50: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T5.X, +; EG-NEXT: ALU clause starting at 56: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 62: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T8.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: ALU clause starting at 68: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T8.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 74: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T8.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T7.Z, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: ALU clause starting at 80: +; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T1.W, T7.X, literal.z, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T7.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV * T7.W, T3.X, +; EG-NEXT: MOV * T7.Y, T5.X, ; ; CM-LABEL: v8i16_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 0, @14, KC0[], KC1[] -; CM-NEXT: TEX 3 @6 -; CM-NEXT: ALU 4, @15, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X +; CM-NEXT: ALU 1, @36, KC0[], KC1[] +; CM-NEXT: TEX 0 @20 +; CM-NEXT: ALU 5, @38, KC0[], KC1[] +; CM-NEXT: TEX 0 @22 +; CM-NEXT: ALU 5, @44, KC0[], KC1[] +; CM-NEXT: TEX 0 @24 +; CM-NEXT: ALU 5, @50, KC0[], KC1[] +; CM-NEXT: TEX 0 @26 +; CM-NEXT: ALU 5, @56, KC0[], KC1[] +; CM-NEXT: TEX 0 @28 +; CM-NEXT: ALU 5, @62, KC0[], KC1[] +; CM-NEXT: TEX 0 @30 +; CM-NEXT: ALU 5, @68, KC0[], KC1[] +; CM-NEXT: TEX 0 @32 +; CM-NEXT: ALU 5, @74, KC0[], KC1[] +; CM-NEXT: TEX 0 @34 +; CM-NEXT: ALU 8, @80, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X ; CM-NEXT: CF_END ; CM-NEXT: PAD -; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_16 T1.X, T0.X, 52, #3 -; CM-NEXT: VTX_READ_16 T2.X, T0.X, 54, #3 -; CM-NEXT: VTX_READ_16 T3.X, T0.X, 62, #3 -; CM-NEXT: VTX_READ_16 T0.X, T0.X, 60, #3 -; CM-NEXT: ALU clause starting at 14: -; CM-NEXT: MOV * T0.X, 0.0, -; CM-NEXT: ALU clause starting at 15: -; CM-NEXT: MOV T1.Y, T2.X, -; CM-NEXT: MOV * T1.Z, T0.X, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; CM-NEXT: MOV * T1.W, T3.X, -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: Fetch clause starting at 20: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 +; CM-NEXT: Fetch clause starting at 22: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 +; CM-NEXT: Fetch clause starting at 24: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 +; CM-NEXT: Fetch clause starting at 26: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 +; CM-NEXT: Fetch clause starting at 28: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 +; CM-NEXT: Fetch clause starting at 30: +; 
CM-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 +; CM-NEXT: Fetch clause starting at 32: +; CM-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 +; CM-NEXT: Fetch clause starting at 34: +; CM-NEXT: VTX_READ_16 T7.X, T7.X, 52, #3 +; CM-NEXT: ALU clause starting at 36: +; CM-NEXT: MOV * T0.Y, T3.X, +; CM-NEXT: MOV * T7.X, 0.0, +; CM-NEXT: ALU clause starting at 38: +; CM-NEXT: LSHL T0.Z, T8.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV T3.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: ALU clause starting at 44: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T8.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV * T0.Y, T3.X, +; CM-NEXT: ALU clause starting at 50: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T3.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: ALU clause starting at 56: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV * T0.Y, T2.X, +; CM-NEXT: ALU clause starting at 62: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T8.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T2.X, PV.W, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: ALU clause starting at 68: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T8.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV * T0.Y, T2.X, +; CM-NEXT: ALU clause starting at 74: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T8.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T7.Z, PV.Z, PV.W, +; CM-NEXT: MOV T2.X, PV.Z, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: ALU clause starting at 80: +; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; CM-NEXT: AND_INT * T0.W, T7.X, literal.z, +; CM-NEXT: 2(2.802597e-45), -65536(nan) +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: OR_INT * T7.X, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.X, +; CM-NEXT: MOV * T7.W, T3.X, +; CM-NEXT: MOV * T7.Y, T5.X, entry: store <8 x i16> %in, ptr addrspace(1) %out ret void @@ -3453,68 +3618,392 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) { ; ; EG-LABEL: v16i16_arg: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @22, KC0[], KC1[] -; EG-NEXT: TEX 7 @6 -; EG-NEXT: ALU 10, @23, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T0.X, 1 +; EG-NEXT: ALU 1, @68, KC0[], KC1[] +; EG-NEXT: TEX 0 @36 +; EG-NEXT: ALU 5, @70, KC0[], KC1[] +; EG-NEXT: TEX 0 @38 +; EG-NEXT: ALU 5, @76, KC0[], KC1[] +; EG-NEXT: TEX 0 @40 +; EG-NEXT: ALU 5, @82, KC0[], KC1[] +; EG-NEXT: TEX 0 @42 +; EG-NEXT: ALU 5, @88, KC0[], KC1[] +; EG-NEXT: TEX 0 @44 +; EG-NEXT: ALU 5, @94, KC0[], KC1[] +; EG-NEXT: TEX 0 @46 +; EG-NEXT: ALU 5, @100, KC0[], KC1[] +; EG-NEXT: TEX 0 @48 +; EG-NEXT: ALU 5, @106, KC0[], KC1[] +; EG-NEXT: TEX 0 @50 +; EG-NEXT: ALU 5, @112, KC0[], KC1[] +; EG-NEXT: TEX 0 @52 +; 
EG-NEXT: ALU 5, @118, KC0[], KC1[] +; EG-NEXT: TEX 0 @54 +; EG-NEXT: ALU 5, @124, KC0[], KC1[] +; EG-NEXT: TEX 0 @56 +; EG-NEXT: ALU 5, @130, KC0[], KC1[] +; EG-NEXT: TEX 0 @58 +; EG-NEXT: ALU 5, @136, KC0[], KC1[] +; EG-NEXT: TEX 0 @60 +; EG-NEXT: ALU 5, @142, KC0[], KC1[] +; EG-NEXT: TEX 0 @62 +; EG-NEXT: ALU 5, @148, KC0[], KC1[] +; EG-NEXT: TEX 0 @64 +; EG-NEXT: ALU 5, @154, KC0[], KC1[] +; EG-NEXT: TEX 0 @66 +; EG-NEXT: ALU 13, @160, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3 -; EG-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 94, #3 -; EG-NEXT: VTX_READ_16 T4.X, T0.X, 78, #3 -; EG-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3 -; EG-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3 -; EG-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3 -; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV T1.Y, T2.X, -; EG-NEXT: MOV * T7.Y, T0.X, -; EG-NEXT: MOV * T1.Z, T6.X, -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: MOV T7.Z, T5.X, +; EG-NEXT: Fetch clause starting at 36: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 +; EG-NEXT: Fetch clause starting at 38: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 +; EG-NEXT: Fetch clause starting at 40: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 +; EG-NEXT: Fetch clause starting at 42: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 +; EG-NEXT: Fetch clause starting at 44: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 +; EG-NEXT: Fetch clause starting at 46: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 +; EG-NEXT: Fetch clause starting at 48: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 +; EG-NEXT: Fetch clause starting at 50: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 +; EG-NEXT: Fetch clause starting at 52: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 +; EG-NEXT: Fetch clause starting at 54: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 +; EG-NEXT: Fetch clause starting at 56: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 +; EG-NEXT: Fetch clause starting at 58: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 +; EG-NEXT: Fetch clause starting at 60: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 +; EG-NEXT: Fetch clause starting at 62: +; EG-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 +; EG-NEXT: Fetch clause starting at 64: +; EG-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 +; EG-NEXT: Fetch clause starting at 66: +; EG-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 +; EG-NEXT: ALU clause starting at 68: +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: MOV * T11.X, 0.0, +; EG-NEXT: ALU clause starting at 70: +; EG-NEXT: LSHL T0.W, T12.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T5.X, +; EG-NEXT: ALU clause starting at 76: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T7.X, +; EG-NEXT: ALU clause starting at 82: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV * T0.Y, T9.X, +; EG-NEXT: ALU 
clause starting at 88: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: ALU clause starting at 94: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T5.X, +; EG-NEXT: ALU clause starting at 100: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T7.X, +; EG-NEXT: ALU clause starting at 106: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T7.X, PV.W, +; EG-NEXT: MOV * T0.Y, T9.X, +; EG-NEXT: ALU clause starting at 112: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 118: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.W, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: ALU clause starting at 124: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV * T0.Y, T6.X, +; EG-NEXT: ALU clause starting at 130: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: ALU clause starting at 136: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T1.W, T12.X, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 142: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T12.Z, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: ALU clause starting at 148: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T12.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T12.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV * T0.Y, T6.X, +; EG-NEXT: ALU clause starting at 154: +; EG-NEXT: AND_INT T0.W, T0.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T13.X, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T11.Z, PV.W, PS, +; EG-NEXT: MOV T6.X, PV.Z, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: ALU clause starting at 160: +; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T2.X, PV.W, literal.x, -; EG-NEXT: MOV T7.W, T4.X, -; EG-NEXT: MOV * T1.W, T3.X, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; 
EG-NEXT: LSHR T14.X, PV.W, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T1.W, T11.X, literal.z, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T11.X, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.X, +; EG-NEXT: MOV * T12.W, T3.X, +; EG-NEXT: MOV T12.Y, T5.X, +; EG-NEXT: MOV T11.W, T7.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T11.Y, T9.X, ; ; CM-LABEL: v16i16_arg: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 0, @22, KC0[], KC1[] -; CM-NEXT: TEX 7 @6 -; CM-NEXT: ALU 11, @23, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T2.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T0.X +; CM-NEXT: ALU 1, @68, KC0[], KC1[] +; CM-NEXT: TEX 0 @36 +; CM-NEXT: ALU 5, @70, KC0[], KC1[] +; CM-NEXT: TEX 0 @38 +; CM-NEXT: ALU 5, @76, KC0[], KC1[] +; CM-NEXT: TEX 0 @40 +; CM-NEXT: ALU 5, @82, KC0[], KC1[] +; CM-NEXT: TEX 0 @42 +; CM-NEXT: ALU 5, @88, KC0[], KC1[] +; CM-NEXT: TEX 0 @44 +; CM-NEXT: ALU 5, @94, KC0[], KC1[] +; CM-NEXT: TEX 0 @46 +; CM-NEXT: ALU 5, @100, KC0[], KC1[] +; CM-NEXT: TEX 0 @48 +; CM-NEXT: ALU 5, @106, KC0[], KC1[] +; CM-NEXT: TEX 0 @50 +; CM-NEXT: ALU 5, @112, KC0[], KC1[] +; CM-NEXT: TEX 0 @52 +; CM-NEXT: ALU 5, @118, KC0[], KC1[] +; CM-NEXT: TEX 0 @54 +; CM-NEXT: ALU 5, @124, KC0[], KC1[] +; CM-NEXT: TEX 0 @56 +; CM-NEXT: ALU 5, @130, KC0[], KC1[] +; CM-NEXT: TEX 0 @58 +; CM-NEXT: ALU 5, @136, KC0[], KC1[] +; CM-NEXT: TEX 0 @60 +; CM-NEXT: ALU 5, @142, KC0[], KC1[] +; CM-NEXT: TEX 0 @62 +; CM-NEXT: ALU 5, @148, KC0[], KC1[] +; CM-NEXT: TEX 0 @64 +; CM-NEXT: ALU 5, @154, KC0[], KC1[] +; CM-NEXT: TEX 0 @66 +; CM-NEXT: ALU 14, @160, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T13.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_16 T1.X, T0.X, 84, #3 -; CM-NEXT: VTX_READ_16 T2.X, T0.X, 86, #3 -; CM-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3 -; CM-NEXT: VTX_READ_16 T4.X, T0.X, 94, #3 -; CM-NEXT: VTX_READ_16 T5.X, T0.X, 76, #3 -; CM-NEXT: VTX_READ_16 T6.X, T0.X, 92, #3 -; CM-NEXT: VTX_READ_16 T7.X, T0.X, 68, #3 -; CM-NEXT: VTX_READ_16 T0.X, T0.X, 70, #3 -; CM-NEXT: ALU clause starting at 22: -; CM-NEXT: MOV * T0.X, 0.0, -; CM-NEXT: ALU clause starting at 23: -; CM-NEXT: MOV * T1.Y, T2.X, -; CM-NEXT: MOV T7.Y, T0.X, -; CM-NEXT: MOV T1.Z, T6.X, BS:VEC_120/SCL_212 +; CM-NEXT: Fetch clause starting at 36: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 98, #3 +; CM-NEXT: Fetch clause starting at 38: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 90, #3 +; CM-NEXT: Fetch clause starting at 40: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 82, #3 +; CM-NEXT: Fetch clause starting at 42: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 74, #3 +; CM-NEXT: Fetch clause starting at 44: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 96, #3 +; CM-NEXT: Fetch clause starting at 46: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 88, #3 +; CM-NEXT: Fetch clause starting at 48: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 80, #3 +; CM-NEXT: Fetch clause starting at 50: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 72, #3 +; CM-NEXT: Fetch clause starting at 52: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 94, #3 +; CM-NEXT: Fetch clause starting at 54: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 86, #3 +; CM-NEXT: Fetch clause starting at 56: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 78, #3 +; CM-NEXT: Fetch clause starting at 58: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 70, #3 +; CM-NEXT: Fetch clause starting at 60: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 92, #3 +; CM-NEXT: Fetch clause starting at 
62: +; CM-NEXT: VTX_READ_16 T12.X, T11.X, 84, #3 +; CM-NEXT: Fetch clause starting at 64: +; CM-NEXT: VTX_READ_16 T13.X, T11.X, 76, #3 +; CM-NEXT: Fetch clause starting at 66: +; CM-NEXT: VTX_READ_16 T11.X, T11.X, 68, #3 +; CM-NEXT: ALU clause starting at 68: +; CM-NEXT: MOV * T0.Y, T3.X, +; CM-NEXT: MOV * T11.X, 0.0, +; CM-NEXT: ALU clause starting at 70: +; CM-NEXT: LSHL T0.Z, T12.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV T3.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: ALU clause starting at 76: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV * T0.Y, T7.X, +; CM-NEXT: ALU clause starting at 82: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T7.X, PV.W, +; CM-NEXT: MOV * T0.Y, T9.X, +; CM-NEXT: ALU clause starting at 88: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.W, +; CM-NEXT: MOV * T0.Y, T3.X, +; CM-NEXT: ALU clause starting at 94: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T3.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: ALU clause starting at 100: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV * T0.Y, T7.X, +; CM-NEXT: ALU clause starting at 106: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T7.X, PV.W, +; CM-NEXT: MOV * T0.Y, T9.X, +; CM-NEXT: ALU clause starting at 112: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.W, +; CM-NEXT: MOV * T0.Y, T2.X, +; CM-NEXT: ALU clause starting at 118: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T2.X, PV.W, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: ALU clause starting at 124: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV * T0.Y, T6.X, +; CM-NEXT: ALU clause starting at 130: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T6.X, PV.W, +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: ALU clause starting at 136: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T12.X, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, 
+; CM-NEXT: MOV * T0.Y, T2.X, +; CM-NEXT: ALU clause starting at 142: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T12.Z, PV.Z, PV.W, +; CM-NEXT: MOV T2.X, PV.Z, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: ALU clause starting at 148: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T12.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T12.X, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.X, +; CM-NEXT: MOV * T0.Y, T6.X, +; CM-NEXT: ALU clause starting at 154: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, T13.X, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T11.Z, PV.Z, PV.W, +; CM-NEXT: MOV T6.X, PV.Z, +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: ALU clause starting at 160: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.X, PV.W, literal.x, -; CM-NEXT: MOV T7.Z, T5.X, -; CM-NEXT: MOV * T1.W, T4.X, BS:VEC_120/SCL_212 -; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR T2.X, KC0[2].Y, literal.x, -; CM-NEXT: MOV * T7.W, T3.X, +; CM-NEXT: LSHR * T13.X, PV.W, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; CM-NEXT: AND_INT * T0.W, T11.X, literal.z, +; CM-NEXT: 2(2.802597e-45), -65536(nan) +; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; CM-NEXT: OR_INT * T11.X, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.X, +; CM-NEXT: MOV * T12.W, T3.X, +; CM-NEXT: MOV T12.Y, T5.X, +; CM-NEXT: MOV * T11.W, T7.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV * T11.Y, T9.X, entry: store <16 x i16> %in, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll index 4309cfbe..c29c52c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll @@ -11,6 +11,12 @@ declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel) declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel) declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 %scale_sel) +declare <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel) +declare <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel) +declare <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel) +declare <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel) +declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> %src, i32 %scale, i32 %scale_sel) +declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %scale, i32 %scale_sel) define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) { ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv: @@ -162,3 +168,207 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a store <8 x float> %cvt, ptr addrspace(1) %out, align 32 ret void } + +define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: 
test_cvt_scale_pk16_f16_fp6_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 %scale, i32 0) + store <16 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_fp6 v[2:9], v[10:12], 0x64 scale_sel:1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_fp6 v[2:9], v[10:12], 0x64 scale_sel:1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.fp6(<3 x i32> %src, i32 100, i32 1) + store <16 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scale_pk16_bf16_fp6 v[6:13], v[0:2], v3 scale_sel:2 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 %scale, i32 2) + store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_sl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 +; GFX1250-NEXT: v_mov_b32_e32 v12, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_scale_pk16_bf16_fp6 v[2:9], v[10:12], 0x64 scale_sel:3 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.fp6(<3 x i32> %src, i32 100, i32 3) + store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8 
+ ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 %scale, i32 4) + store <16 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v12, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f16_bf6 v[2:9], v[10:12], 0x64 scale_sel:5 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v10, s0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f16_bf6 v[2:9], v[10:12], 0x64 scale_sel:5 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x half> @llvm.amdgcn.cvt.scale.pk16.f16.bf6(<3 x i32> %src, i32 100, i32 5) + store <16 x half> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[6:13], v[0:2], v3 scale_sel:6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 %scale, i32 6) + store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_sl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1 +; GFX1250-NEXT: v_mov_b32_e32 v12, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_scale_pk16_bf16_bf6 v[2:9], v[10:12], 0x64 scale_sel:7 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX1250-NEXT: s_endpgm + %cvt = tail call <16 x bfloat> @llvm.amdgcn.cvt.scale.pk16.bf16.bf6(<3 x i32> %src, i32 100, i32 7) + store <16 x bfloat> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_fp6_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_fp6_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.fp6(<3 x i32> %src, i32 %scale, i32 5) + store <16 x float> %cvt, ptr addrspace(1) %out, align 16 + ret void +} + +define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_bf6_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_bf6_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[14:17], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[4:5], v[18:21], off offset:48 +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %scale, i32 6) + store <16 x float> %cvt, ptr addrspace(1) %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll new file mode 100644 index 0000000..dfb9089 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float %scale) +declare <3 x i32> 
@llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float %scale) + +define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1210-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 +; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16 +; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f32(<16 x float> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v23, 
v18 :: v_dual_mov_b32 v22, v17 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 +; GFX1250-SDAG-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16 +; GFX1250-GISEL-NEXT: global_store_b96 v[22:23], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[2:17], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f32(<16 x float> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void 
@test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.bf16(<16 x bfloat> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; 
GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.bf6.f16(<16 x half> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.bf16(<16 x bfloat> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b96 v[14:15], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void 
@test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.pk16.fp6.f16(<16 x half> %src, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll new file mode 100644 index 0000000..cd0b081 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll @@ -0,0 +1,403 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %src, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %scale) + +define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x 
bfloat> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[8:9], v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[8:9], v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[2:5], 
0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[12:13], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[12:13], v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg 
%src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8 +; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v9, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8 +; GFX1250-GISEL-NEXT: global_store_b32 v[12:13], v9, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f32 v10, v[2:9], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f32 v10, v[2:9], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void 
@test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b32 v[8:9], v5, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f16 v6, v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f16 v6, v[2:5], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5 +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4 +; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v5, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v6, v[2:5], 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 
:: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v6, v[2:5], 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll new file mode 100644 index 0000000..d33acf6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll @@ -0,0 +1,385 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float %scale) +declare <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float %scale) +declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_bf16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, 
s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f16(<8 x half> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f16 v[6:7], v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f16(<8 x half> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, 
i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_bf8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.bf8.f32(<8 x float> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b64 v[10:11], v[12:13], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float %scale) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp8_f32 v[10:11], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off +; 
GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk8.fp8.f32(<8 x float> %src, i32 %sr, float 100.0) + store <2 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 +; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v12, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9 +; GFX1250-GISEL-NEXT: global_store_b32 v[10:11], v12, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f32 v10, v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v10, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f32(<8 x float> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_f16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.f16(<8 x half> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 +; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5 +; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v8, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float %scale) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk8_fp4_bf16 v6, v[2:5], s4, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr, float 100.0) + store i32 %cvt, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll new file mode 100644 index 0000000..c439518 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 
%scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale) +declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale) + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9 +; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9 +; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], 
v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9 +; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9 +; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f16_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[10:12], v[2:9], s8, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr 
addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f32 v[20:22], v[0:15], v16, v17 +; GFX1250-NEXT: global_store_b96 v[18:19], v[20:22], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f32 v[18:20], v[2:17], s16, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f32 v[18:20], v[2:17], s16, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f32 v[20:22], v[0:15], v16, v17 +; GFX1250-NEXT: global_store_b96 v[18:19], v[20:22], off +; GFX1250-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) { +; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f32_sl: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7 +; 
GFX1250-SDAG-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v11, s9 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v13, s11 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s13 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v17, s15 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_fp6_f32 v[18:20], v[2:17], s16, 0x42c80000 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f32_sl: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_fp6_f32 v[18:20], v[2:17], s16, 0x42c80000 +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[18:20], off +; GFX1250-GISEL-NEXT: s_endpgm + %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float 100.0) + store <3 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll new file mode 100644 index 0000000..d2f96c4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.perm.pk.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +declare <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32, i32, <2 x i32>) +declare <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32, i64, <2 x i32>) +declare <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64, i64, <2 x i32>) + +define void @test_perm_pk16_b4_u4(i32 %a, i32 %b, <2 x i32> %c, ptr %out) { +; GFX1250-LABEL: test_perm_pk16_b4_u4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_perm_pk16_b4_u4 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: flat_store_b64 v[4:5], v[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <2 x i32> @llvm.amdgcn.perm.pk16.b4.u4(i32 %a, i32 %b, <2 x i32> %c) + store <2 x i32> %ret, ptr %out, align 8 + ret void +} + +define void @test_perm_pk16_b6_u4(i32 %a, i64 %b, <2 x i32> %c, ptr %out) { +; GFX1250-SDAG-LABEL: test_perm_pk16_b6_u4: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v6, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[2:3], v[8:9] +; GFX1250-SDAG-NEXT: flat_store_b96 v[6:7], v[0:2] scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: test_perm_pk16_b6_u4: 
+; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, v3 :: v_dual_mov_b32 v3, v4 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_perm_pk16_b6_u4 v[0:2], v0, v[8:9], v[2:3] +; GFX1250-GISEL-NEXT: flat_store_b96 v[4:5], v[0:2] scope:SCOPE_SE +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <3 x i32> @llvm.amdgcn.perm.pk16.b6.u4(i32 %a, i64 %b, <2 x i32> %c) + store <3 x i32> %ret, ptr %out, align 16 + ret void +} + +define void @test_perm_pk16_b8_u4(i64 %a, i64 %b, <2 x i32> %c, ptr %out) { +; GFX1250-LABEL: test_perm_pk16_b8_u4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_perm_pk16_b8_u4 v[0:3], v[0:1], v[2:3], v[4:5] +; GFX1250-NEXT: flat_store_b128 v[6:7], v[0:3] scope:SCOPE_SE +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ret = tail call <4 x i32> @llvm.amdgcn.perm.pk16.b8.u4(i64 %a, i64 %b, <2 x i32> %c) + store <4 x i32> %ret, ptr %out, align 16 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 9149ed5..1bf865c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -854,6 +854,1202 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 
matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 2, i32 1, i32 %scale_src0, i32 1, i32 2, i32 %scale_src1, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 3, i32 2, i32 %scale_src0, i32 0, i32 1, i32 100, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +;
GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4 +; 
GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 
v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; 
GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[30:31], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, 
<16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[38:39], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[38:39], v[28:31], off +; GISEL-NEXT: global_store_b128 v[38:39], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + 
%res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[30:31], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off +; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[30:31], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[30:31], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[30:31], v[20:23], off +; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[26:27], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[26:27], v[16:19], off +; GISEL-NEXT: global_store_b128 v[26:27], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i32 %scale_src0, i32 0, i32 0, i32 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off +; GISEL-NEXT: 
global_store_b128 v[44:45], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 2, i32 1, i64 %scale_src0, i32 1, i32 2, i64 %scale_src1, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float>
@llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C, i32 3, i32 2, i64 %scale_src0, i32 0, i32 1, i64 100, i1 false, i1 true) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off +; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6: +;
GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off +; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: 
v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off +; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; 
GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; 
GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], 
v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 
%scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[20:23], off +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 4, <12 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: 
v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[16:19], off +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C, i32 0, i32 0, i64 %scale_src0, i32 0, i32 0, i64 %scale_src1, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -1040,6 +2236,170 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[42:43], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off +; GISEL-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; 
GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i32 %scale_src0, i32 1, i32 2, i32 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i32 %scale_src0, i32 0, i32 1, i32 100, i1 false, i1 true) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[44:45], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[44:45], v[24:27], off +; 
GISEL-NEXT: global_store_b128 v[44:45], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[44:45], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[44:45], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 2, i32 1, i64 %scale_src0, i32 1, i32 2, i64 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 
matrix_b_reuse
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 3, i32 2, i64 %scale_src0, i32 0, i32 1, i64 100, i1 false, i1 true)
+ store <16 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GFX1250: ; %bb.0: ; %bb
@@ -1366,6 +2726,8 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
 declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
@@ -1375,6 +2737,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
 declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
 declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index 12ea314..48303c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -1446,6 +1446,220 @@ bb:
 ret void
 }
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32>
%B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_movk_i32 s0, 0x65 +; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v43, 0x65 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65 +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], 
v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65 +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -2316,6 +2530,312 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 +; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26 +; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26 +; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26 +; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26 +; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26 +; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i32 1, i32 1, i32 0, i32 2, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 +; GFX1250-NEXT: s_movk_i32 s0, 0x65 
+; GFX1250-NEXT: s_movk_i32 s1, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 +; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26 +; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26 +; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26 +; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26 +; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26 +; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26 +; GFX1250-NEXT: v_mov_b32_e32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s1, s0 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b32_e32 v42, 0x64 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: v_mov_b32_e32 v43, 0x65 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v42, v43 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i32 100, i32 1, i32 0, i32 101, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] 
matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 %scale_src0, i32 1, i32 0, i64 %scale_src1, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26 +; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26 +; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26 +; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26 +; GFX1250-NEXT: v_dual_mov_b32 v36, v26 :: v_dual_mov_b32 v37, v26 +; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26 +; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: 
v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], 1, 2 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 1, i32 0, i64 1, i32 1, i32 0, i64 2, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000 +; GFX1250-NEXT: s_mov_b64 s[0:1], 0x65 +; GFX1250-NEXT: s_mov_b64 s[2:3], 0x64 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26 +; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26 +; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26 +; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26 +; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26 +; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26 +; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26 +; GFX1250-NEXT: v_mov_b32_e32 v41, v26 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], s[2:3], s[0:1] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: v_mov_b64_e32 v[42:43], 0x64 +; GISEL-NEXT: s_mov_b32 s14, s0 +; GISEL-NEXT: s_mov_b32 s15, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s8, s0 +; GISEL-NEXT: s_mov_b32 s9, s0 +; GISEL-NEXT: s_mov_b32 s10, s0 +; GISEL-NEXT: s_mov_b32 s11, s0 +; GISEL-NEXT: s_mov_b32 s12, s0 +; GISEL-NEXT: s_mov_b32 s13, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] +; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7] +; 
GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1] +; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0x65 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41], v[42:43], v[44:45] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off +; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16 +; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32 +; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i32 1, i32 0, i64 100, i32 1, i32 0, i64 101, i1 true, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1) declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1) @@ -2332,6 +2852,8 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) @@ -2341,3 +2863,5 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, 
i32, i64, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index bf8308b..8f674f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -1192,6 +1192,138 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 
v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, 
<8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb @@ -1750,6 +1882,162 @@ bb: ret void } +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], 
v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 4, <16 x float> %C, i32 1, i32 0, i32 2, i32 1, i32 0, i32 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] +; GISEL-NEXT: 
s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> 
%B, i16 4, <16 x float> %C, i32 1, i32 0, i64 2, i32 1, i32 0, i64 4, i1 false, i1 false) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA: ; GFX1250: ; %bb.0: ; %bb @@ -2034,6 +2322,8 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.scale.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.scale16.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>, i32, i32, i64, i32, i32, i64, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) @@ -2043,6 +2333,8 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) +declare <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i32, i32, i32, i32, i1, i1) +declare <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32>, <8 x i32>, i16, <16 x float>, i32, i32, i64, i32, i32, i64, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 0e66b0a..22f562a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -784,13 +784,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7fff ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v2, 0xffff8000, v2, v5 ; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v3, v5 -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v5 ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -910,9 +910,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v0.h, v2.h +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index bfc01ef..d59f72a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -8343,53 +8343,53 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshr_b32 s42, s5, 30 -; GFX6-NEXT: s_lshr_b32 s36, s5, 28 -; GFX6-NEXT: s_lshr_b32 s38, s5, 29 -; GFX6-NEXT: s_lshr_b32 s30, s5, 26 -; GFX6-NEXT: s_lshr_b32 s34, s5, 27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 24 -; GFX6-NEXT: s_lshr_b32 s28, s5, 25 -; GFX6-NEXT: s_lshr_b32 s22, s5, 22 -; GFX6-NEXT: s_lshr_b32 s24, s5, 23 -; GFX6-NEXT: s_lshr_b32 s18, s5, 20 -; GFX6-NEXT: s_lshr_b32 s20, s5, 21 -; GFX6-NEXT: s_lshr_b32 s14, s5, 18 -; GFX6-NEXT: s_lshr_b32 s16, s5, 19 -; GFX6-NEXT: s_lshr_b32 s10, s5, 16 -; GFX6-NEXT: s_lshr_b32 s12, s5, 17 -; GFX6-NEXT: s_lshr_b32 s6, s5, 14 -; GFX6-NEXT: s_lshr_b32 s8, s5, 15 -; GFX6-NEXT: s_mov_b32 s40, s5 +; GFX6-NEXT: s_lshr_b32 s36, s4, 30 +; GFX6-NEXT: s_lshr_b32 s38, s4, 31 +; GFX6-NEXT: s_lshr_b32 s30, s4, 28 +; GFX6-NEXT: s_lshr_b32 s34, s4, 29 +; GFX6-NEXT: s_lshr_b32 s26, s4, 26 +; GFX6-NEXT: s_lshr_b32 s28, s4, 27 +; GFX6-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NEXT: s_lshr_b32 s24, s4, 25 +; GFX6-NEXT: s_lshr_b32 s18, s4, 22 +; GFX6-NEXT: s_lshr_b32 s20, s4, 23 +; GFX6-NEXT: s_lshr_b32 s14, s4, 20 +; GFX6-NEXT: s_lshr_b32 s16, s4, 21 +; GFX6-NEXT: s_lshr_b32 s10, s4, 18 +; GFX6-NEXT: s_lshr_b32 s12, s4, 19 +; GFX6-NEXT: s_lshr_b32 s6, s4, 16 +; GFX6-NEXT: s_lshr_b32 s8, s4, 17 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v4, s7 -; GFX6-NEXT: s_lshr_b32 s40, s5, 12 +; GFX6-NEXT: s_lshr_b32 s40, s4, 14 ; GFX6-NEXT: v_mov_b32_e32 v0, s44 ; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[4:5], 0x10000 +; GFX6-NEXT: s_mov_b32 s44, s5 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v6, s44 ; GFX6-NEXT: v_mov_b32_e32 v7, s45 -; GFX6-NEXT: s_lshr_b32 s44, s5, 13 +; GFX6-NEXT: s_lshr_b32 s44, s4, 15 ; GFX6-NEXT: v_mov_b32_e32 v2, s42 ; GFX6-NEXT: v_mov_b32_e32 v3, s43 -; GFX6-NEXT: s_lshr_b32 s42, s5, 10 +; GFX6-NEXT: s_lshr_b32 s42, s4, 12 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[38:39], 
s[38:39], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v8, s36 ; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s5, 11 +; GFX6-NEXT: s_lshr_b32 s36, s4, 13 ; GFX6-NEXT: v_mov_b32_e32 v10, s38 ; GFX6-NEXT: v_mov_b32_e32 v11, s39 -; GFX6-NEXT: s_lshr_b32 s38, s5, 8 +; GFX6-NEXT: s_lshr_b32 s38, s4, 10 ; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v12, s30 ; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s5, 9 +; GFX6-NEXT: s_lshr_b32 s30, s4, 11 ; GFX6-NEXT: v_mov_b32_e32 v14, s34 ; GFX6-NEXT: v_mov_b32_e32 v15, s35 -; GFX6-NEXT: s_lshr_b32 s34, s5, 6 +; GFX6-NEXT: s_lshr_b32 s34, s4, 8 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v5, s7 @@ -8397,190 +8397,191 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: s_lshr_b32 s26, s5, 7 +; GFX6-NEXT: s_lshr_b32 s26, s4, 9 ; GFX6-NEXT: v_mov_b32_e32 v4, s28 ; GFX6-NEXT: v_mov_b32_e32 v5, s29 -; GFX6-NEXT: s_lshr_b32 s28, s5, 4 +; GFX6-NEXT: s_lshr_b32 s28, s4, 6 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s22 ; GFX6-NEXT: v_mov_b32_e32 v9, s23 -; GFX6-NEXT: s_lshr_b32 s22, s5, 5 +; GFX6-NEXT: s_lshr_b32 s22, s4, 7 ; GFX6-NEXT: v_mov_b32_e32 v10, s24 ; GFX6-NEXT: v_mov_b32_e32 v11, s25 -; GFX6-NEXT: s_lshr_b32 s24, s5, 2 +; GFX6-NEXT: s_lshr_b32 s24, s4, 4 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:224 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s18 ; GFX6-NEXT: v_mov_b32_e32 v13, s19 -; GFX6-NEXT: s_lshr_b32 s18, s5, 3 +; GFX6-NEXT: s_lshr_b32 s18, s4, 5 ; GFX6-NEXT: v_mov_b32_e32 v14, s20 ; GFX6-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NEXT: s_lshr_b32 s20, s5, 1 +; GFX6-NEXT: s_lshr_b32 s20, s4, 2 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:208 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 30 +; GFX6-NEXT: s_lshr_b32 s14, s4, 3 ; GFX6-NEXT: v_mov_b32_e32 v4, s16 ; GFX6-NEXT: v_mov_b32_e32 v5, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 31 +; GFX6-NEXT: s_lshr_b32 s16, s4, 1 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:432 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:192 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s10 ; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 28 +; GFX6-NEXT: s_lshr_b32 s10, s5, 29 ; GFX6-NEXT: v_mov_b32_e32 v10, s12 ; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 29 +; GFX6-NEXT: s_lshr_b32 s12, s5, 28 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 
; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:416 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s6 ; GFX6-NEXT: v_mov_b32_e32 v13, s7 -; GFX6-NEXT: s_lshr_b32 s46, s4, 26 +; GFX6-NEXT: s_lshr_b32 s6, s5, 26 ; GFX6-NEXT: v_mov_b32_e32 v14, s8 ; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 27 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s8, s5, 27 +; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:400 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:160 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v2, s40 ; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 24 -; GFX6-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NEXT: s_lshr_b32 s44, s4, 25 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[36:37], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:384 +; GFX6-NEXT: s_lshr_b32 s40, s5, 25 +; GFX6-NEXT: v_mov_b32_e32 v4, s44 +; GFX6-NEXT: v_mov_b32_e32 v5, s45 +; GFX6-NEXT: s_lshr_b32 s44, s5, 24 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 22 -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: s_lshr_b32 s42, s4, 23 -; GFX6-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[38:39], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:368 +; GFX6-NEXT: v_mov_b32_e32 v8, s42 +; GFX6-NEXT: v_mov_b32_e32 v9, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 22 +; GFX6-NEXT: v_mov_b32_e32 v10, s36 +; GFX6-NEXT: v_mov_b32_e32 v11, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 23 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s30 -; GFX6-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 20 -; GFX6-NEXT: v_mov_b32_e32 v14, s6 -; GFX6-NEXT: v_mov_b32_e32 v15, s7 -; GFX6-NEXT: s_lshr_b32 s6, s4, 21 +; GFX6-NEXT: v_mov_b32_e32 v12, s38 +; GFX6-NEXT: v_mov_b32_e32 v13, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 20 +; GFX6-NEXT: v_mov_b32_e32 v14, s30 +; GFX6-NEXT: v_mov_b32_e32 v15, s31 +; GFX6-NEXT: s_lshr_b32 s4, s5, 21 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:352 -; GFX6-NEXT: v_mov_b32_e32 v16, s34 -; GFX6-NEXT: v_mov_b32_e32 v17, s35 -; GFX6-NEXT: s_lshr_b32 s34, s4, 18 -; GFX6-NEXT: v_mov_b32_e32 v18, s26 -; GFX6-NEXT: v_mov_b32_e32 v19, s27 -; GFX6-NEXT: s_lshr_b32 s26, s4, 19 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[34:35], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:112 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s30 +; GFX6-NEXT: v_mov_b32_e32 v3, s31 +; GFX6-NEXT: s_lshr_b32 s30, s5, 18 +; GFX6-NEXT: v_mov_b32_e32 v4, s26 +; GFX6-NEXT: v_mov_b32_e32 v5, 
s27 +; GFX6-NEXT: s_lshr_b32 s26, s5, 19 ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:336 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v8, s28 ; GFX6-NEXT: v_mov_b32_e32 v9, s29 -; GFX6-NEXT: s_lshr_b32 s28, s4, 16 +; GFX6-NEXT: s_lshr_b32 s28, s5, 17 ; GFX6-NEXT: v_mov_b32_e32 v10, s22 ; GFX6-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NEXT: s_lshr_b32 s22, s4, 17 +; GFX6-NEXT: s_lshr_b32 s22, s5, 16 +; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:320 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v12, s24 ; GFX6-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NEXT: s_lshr_b32 s24, s4, 14 -; GFX6-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: s_lshr_b32 s24, s5, 14 ; GFX6-NEXT: v_mov_b32_e32 v14, s18 ; GFX6-NEXT: v_mov_b32_e32 v15, s19 -; GFX6-NEXT: s_lshr_b32 s18, s4, 15 -; GFX6-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NEXT: v_mov_b32_e32 v3, s21 -; GFX6-NEXT: s_lshr_b32 s20, s4, 12 -; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: s_lshr_b32 s18, s5, 15 +; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:64 +; GFX6-NEXT: v_mov_b32_e32 v16, s20 +; GFX6-NEXT: v_mov_b32_e32 v17, s21 +; GFX6-NEXT: s_lshr_b32 s20, s5, 12 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:304 +; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v18, s14 +; GFX6-NEXT: v_mov_b32_e32 v19, s15 +; GFX6-NEXT: s_lshr_b32 s14, s5, 13 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s14 -; GFX6-NEXT: v_mov_b32_e32 v17, s15 -; GFX6-NEXT: s_lshr_b32 s14, s4, 13 -; GFX6-NEXT: v_mov_b32_e32 v18, s16 -; GFX6-NEXT: v_mov_b32_e32 v19, s17 -; GFX6-NEXT: s_lshr_b32 s16, s4, 10 +; GFX6-NEXT: v_mov_b32_e32 v2, s16 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: s_lshr_b32 s16, s5, 10 ; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:288 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s10 -; GFX6-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NEXT: s_lshr_b32 s10, s4, 11 -; GFX6-NEXT: v_mov_b32_e32 v10, s12 -; GFX6-NEXT: v_mov_b32_e32 v11, s13 -; GFX6-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NEXT: v_mov_b32_e32 v8, s12 +; GFX6-NEXT: v_mov_b32_e32 v9, s13 +; GFX6-NEXT: s_lshr_b32 s12, s5, 11 +; GFX6-NEXT: v_mov_b32_e32 v10, s10 +; GFX6-NEXT: v_mov_b32_e32 v11, s11 +; GFX6-NEXT: s_lshr_b32 s10, s5, 8 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[46:47], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:272 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v12, s38 -; GFX6-NEXT: v_mov_b32_e32 v13, s39 -; GFX6-NEXT: s_lshr_b32 s38, s4, 9 +; GFX6-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NEXT: 
s_lshr_b32 s6, s5, 9 ; GFX6-NEXT: v_mov_b32_e32 v14, s8 ; GFX6-NEXT: v_mov_b32_e32 v15, s9 -; GFX6-NEXT: s_lshr_b32 s8, s4, 6 -; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_lshr_b32 s8, s5, 6 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[44:45], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:256 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s40 -; GFX6-NEXT: v_mov_b32_e32 v1, s41 -; GFX6-NEXT: s_lshr_b32 s40, s4, 7 -; GFX6-NEXT: v_mov_b32_e32 v2, s44 -; GFX6-NEXT: v_mov_b32_e32 v3, s45 -; GFX6-NEXT: s_lshr_b32 s44, s4, 4 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v16, s34 +; GFX6-NEXT: v_mov_b32_e32 v17, s35 +; GFX6-NEXT: s_lshr_b32 s34, s5, 7 +; GFX6-NEXT: v_mov_b32_e32 v18, s40 +; GFX6-NEXT: v_mov_b32_e32 v19, s41 +; GFX6-NEXT: s_lshr_b32 s40, s5, 4 ; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v16, s36 -; GFX6-NEXT: v_mov_b32_e32 v17, s37 -; GFX6-NEXT: s_lshr_b32 s36, s4, 5 -; GFX6-NEXT: v_mov_b32_e32 v18, s42 -; GFX6-NEXT: v_mov_b32_e32 v19, s43 -; GFX6-NEXT: s_lshr_b32 s42, s4, 2 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:224 +; GFX6-NEXT: v_mov_b32_e32 v0, s42 +; GFX6-NEXT: v_mov_b32_e32 v1, s43 +; GFX6-NEXT: s_lshr_b32 s42, s5, 5 +; GFX6-NEXT: v_mov_b32_e32 v2, s36 +; GFX6-NEXT: v_mov_b32_e32 v3, s37 +; GFX6-NEXT: s_lshr_b32 s36, s5, 2 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:480 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v8, s30 -; GFX6-NEXT: v_mov_b32_e32 v9, s31 -; GFX6-NEXT: s_lshr_b32 s30, s4, 3 -; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NEXT: v_mov_b32_e32 v9, s39 +; GFX6-NEXT: s_lshr_b32 s38, s5, 3 +; GFX6-NEXT: s_lshr_b32 s44, s5, 1 ; GFX6-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000 @@ -8589,71 +8590,71 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX6-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000 ; GFX6-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 -; GFX6-NEXT: s_bfe_i64 s[6:7], 
s[6:7], 0x10000 -; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NEXT: v_mov_b32_e32 v10, s6 -; GFX6-NEXT: v_mov_b32_e32 v11, s7 -; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GFX6-NEXT: s_waitcnt expcnt(2) -; GFX6-NEXT: v_mov_b32_e32 v0, s34 -; GFX6-NEXT: v_mov_b32_e32 v1, s35 +; GFX6-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000 +; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000 +; GFX6-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:464 +; GFX6-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:448 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:432 +; GFX6-NEXT: v_mov_b32_e32 v10, s4 +; GFX6-NEXT: v_mov_b32_e32 v11, s5 +; GFX6-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:416 +; GFX6-NEXT: s_waitcnt expcnt(1) +; GFX6-NEXT: v_mov_b32_e32 v0, s30 +; GFX6-NEXT: v_mov_b32_e32 v1, s31 ; GFX6-NEXT: v_mov_b32_e32 v2, s26 ; GFX6-NEXT: v_mov_b32_e32 v3, s27 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:400 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s28 -; GFX6-NEXT: v_mov_b32_e32 v1, s29 -; GFX6-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GFX6-NEXT: v_mov_b32_e32 v0, s22 +; GFX6-NEXT: v_mov_b32_e32 v1, s23 +; GFX6-NEXT: v_mov_b32_e32 v2, s28 +; GFX6-NEXT: v_mov_b32_e32 v3, s29 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:384 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s24 ; GFX6-NEXT: v_mov_b32_e32 v1, s25 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:368 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s20 ; GFX6-NEXT: v_mov_b32_e32 v1, s21 ; GFX6-NEXT: v_mov_b32_e32 v2, s14 ; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:352 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s16 ; GFX6-NEXT: v_mov_b32_e32 v1, s17 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: v_mov_b32_e32 v3, s11 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_mov_b32_e32 v3, s13 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:336 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s6 +; GFX6-NEXT: v_mov_b32_e32 v3, s7 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:320 ; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s40 -; GFX6-NEXT: v_mov_b32_e32 v3, s41 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GFX6-NEXT: v_mov_b32_e32 v2, s34 +; GFX6-NEXT: v_mov_b32_e32 v3, s35 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:304 ; 
GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NEXT: v_mov_b32_e32 v2, s36 -; GFX6-NEXT: v_mov_b32_e32 v3, s37 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GFX6-NEXT: v_mov_b32_e32 v0, s40 +; GFX6-NEXT: v_mov_b32_e32 v1, s41 +; GFX6-NEXT: v_mov_b32_e32 v2, s42 +; GFX6-NEXT: v_mov_b32_e32 v3, s43 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:288 ; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s42 -; GFX6-NEXT: v_mov_b32_e32 v1, s43 -; GFX6-NEXT: v_mov_b32_e32 v2, s30 -; GFX6-NEXT: v_mov_b32_e32 v3, s31 -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NEXT: v_mov_b32_e32 v8, s4 -; GFX6-NEXT: v_mov_b32_e32 v9, s5 -; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NEXT: v_mov_b32_e32 v1, s37 +; GFX6-NEXT: v_mov_b32_e32 v2, s38 +; GFX6-NEXT: v_mov_b32_e32 v3, s39 +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:272 +; GFX6-NEXT: v_mov_b32_e32 v8, s44 +; GFX6-NEXT: v_mov_b32_e32 v9, s45 +; GFX6-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:256 ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 4491c4b..a135b43 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -232,32 +232,38 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp ; ; EG-LABEL: constant_load_v3i16: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 -; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X +; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 2 @6 +; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 +; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 +; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 13: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T2.X, T2.W, PV.W, -; EG-NEXT: LSHL * T2.W, literal.x, PV.W, +; EG-NEXT: LSHL T5.X, T2.W, PV.W, +; EG-NEXT: LSHL * T5.W, literal.x, PV.W, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: MOV * T5.Z, 0.0, +; EG-NEXT: LSHR T8.X, T0.W, literal.x, +; EG-NEXT: LSHL T0.W, T7.X, literal.y, +; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, +; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T2.Y, 0.0, -; EG-NEXT: MOV * T2.Z, 0.0, -; EG-NEXT: LSHR T0.X, T0.W, literal.x, -; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT 
T6.X, PV.W, PS, +; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; GFX12-LABEL: constant_load_v3i16: @@ -1643,15 +1649,15 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s4, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s4, 16 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s5, s5 ; GCN-NOHSA-SI-NEXT: s_sext_i32_i16 s4, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1666,14 +1672,14 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_ashr_i32 s0, s2, 16 -; GCN-HSA-NEXT: s_ashr_i32 s1, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s0, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s1, s2, 16 ; GCN-HSA-NEXT: s_sext_i32_i16 s3, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -6539,33 +6545,33 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s5 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s13, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-SI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 @@ -6586,8 +6592,8 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_mov_b32 s2, s7 -; GCN-HSA-NEXT: s_mov_b32 s8, s5 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s10, s5 ; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 ; GCN-HSA-NEXT: s_ashr_i32 s13, s5, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x100000 @@ -6605,25 +6611,25 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -7161,12 +7167,12 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, -1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s7 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s16, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s3 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s1 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x100000 @@ -7174,60 +7180,60 @@ define amdgpu_kernel void 
@constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x100000 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s1, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[16:17], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s40, s7, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[0:1], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s3, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s20, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s7, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[24:25], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s20 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v8, s30 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s1 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s28 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s12 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s13 @@ -7243,19 +7249,19 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s12, s7 +; GCN-HSA-NEXT: s_mov_b32 s10, s7 +; GCN-HSA-NEXT: s_lshr_b32 s12, s6, 16 ; GCN-HSA-NEXT: s_mov_b32 s14, s5 -; GCN-HSA-NEXT: s_mov_b32 s16, s3 -; GCN-HSA-NEXT: s_mov_b32 s18, s1 -; GCN-HSA-NEXT: s_ashr_i32 s27, s1, 31 +; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 16 +; GCN-HSA-NEXT: s_ashr_i32 s25, s1, 31 ; GCN-HSA-NEXT: s_ashr_i32 s29, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s30, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s24, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s26, s0, 16 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_mov_b32 s18, s3 +; GCN-HSA-NEXT: s_lshr_b32 s20, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s22, s1 +; GCN-HSA-NEXT: s_lshr_b32 s24, s0, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[2:3], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s28, s1, 16 ; GCN-HSA-NEXT: s_ashr_i32 s31, s5, 31 ; GCN-HSA-NEXT: s_ashr_i32 s33, s5, 16 @@ -7266,55 +7272,36 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[24:25], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[22:23], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GCN-HSA-NEXT: s_add_u32 s24, s8, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s8, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; 
GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_add_u32 s22, s8, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s34 -; GCN-HSA-NEXT: s_add_u32 s14, s8, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 +; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: s_add_u32 s14, s8, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s15, s9, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s8, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -7323,17 +7310,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s8, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s8, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s9, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 @@ -8307,148 +8312,151 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, s15 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, s13 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s11 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s1, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s3, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s3, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s5, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s5, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s9, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s9, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s55, s11, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s11, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s11 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s24, s9 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s23, s1, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s25, s1, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s3, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s3, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s37, s7, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s9, 31 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[24:25], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s13, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s59, s13, 16 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s60, s15, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s61, s15, 16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s54, s5 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s3 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s56, s1 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s0, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[22:23], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[20:21], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[54:55], s[18:19], 0x100000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s43, s9, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s11, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s47, s11, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s13, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s56, s13, 16 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s57, s15, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s58, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s8, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s46, s7 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s6, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s44, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s4, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s3 +; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s2, 16 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s0, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[14:15], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s55 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s49 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s57 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[22:23], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[56:57], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[42:43], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[40:41], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[38:39], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:176 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s41 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[46:47], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[44:45], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[42:43], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s39 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s35 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s14 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[38:39], 0x100000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s27 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s10 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[36:37], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[34:35], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[30:31], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[28:29], 0x100000 +; 
GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x100000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x100000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s7 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, s21 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s35 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64: @@ -8460,47 +8468,47 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_mov_b32 s34, s15 -; GCN-HSA-NEXT: s_ashr_i32 s41, s3, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s3, 16 -; GCN-HSA-NEXT: s_ashr_i32 s57, s5, 16 -; GCN-HSA-NEXT: s_ashr_i32 s59, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s61, s7, 16 -; GCN-HSA-NEXT: s_ashr_i32 s63, s9, 31 
-; GCN-HSA-NEXT: s_ashr_i32 s65, s9, 16 -; GCN-HSA-NEXT: s_ashr_i32 s67, s11, 31 -; GCN-HSA-NEXT: s_ashr_i32 s69, s11, 16 -; GCN-HSA-NEXT: s_mov_b32 s44, s13 -; GCN-HSA-NEXT: s_mov_b32 s46, s11 -; GCN-HSA-NEXT: s_mov_b32 s48, s9 -; GCN-HSA-NEXT: s_mov_b32 s50, s7 -; GCN-HSA-NEXT: s_mov_b32 s52, s5 -; GCN-HSA-NEXT: s_mov_b32 s38, s3 -; GCN-HSA-NEXT: s_mov_b32 s36, s1 -; GCN-HSA-NEXT: s_lshr_b32 s54, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s56, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s58, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s60, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s62, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s64, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s66, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s24, s15 +; GCN-HSA-NEXT: s_ashr_i32 s37, s3, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s3, 16 +; GCN-HSA-NEXT: s_ashr_i32 s57, s11, 16 +; GCN-HSA-NEXT: s_ashr_i32 s59, s13, 31 +; GCN-HSA-NEXT: s_ashr_i32 s61, s13, 16 +; GCN-HSA-NEXT: s_ashr_i32 s63, s15, 31 +; GCN-HSA-NEXT: s_ashr_i32 s65, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s46, s14, 16 +; GCN-HSA-NEXT: s_mov_b32 s48, s13 +; GCN-HSA-NEXT: s_lshr_b32 s50, s12, 16 +; GCN-HSA-NEXT: s_mov_b32 s52, s11 +; GCN-HSA-NEXT: s_lshr_b32 s34, s10, 16 +; GCN-HSA-NEXT: s_mov_b32 s30, s9 +; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 16 +; GCN-HSA-NEXT: s_mov_b32 s54, s7 +; GCN-HSA-NEXT: s_lshr_b32 s56, s6, 16 +; GCN-HSA-NEXT: s_mov_b32 s58, s5 +; GCN-HSA-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-HSA-NEXT: s_mov_b32 s62, s3 +; GCN-HSA-NEXT: s_lshr_b32 s64, s2, 16 +; GCN-HSA-NEXT: s_mov_b32 s66, s1 ; GCN-HSA-NEXT: s_lshr_b32 s68, s0, 16 ; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[2:3], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[34:35], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[24:25], 0x100000 ; GCN-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GCN-HSA-NEXT: s_ashr_i32 s40, s1, 16 -; GCN-HSA-NEXT: s_ashr_i32 s43, s5, 31 -; GCN-HSA-NEXT: s_ashr_i32 s70, s13, 31 -; GCN-HSA-NEXT: s_ashr_i32 s71, s13, 16 -; GCN-HSA-NEXT: s_ashr_i32 s72, s15, 31 -; GCN-HSA-NEXT: s_ashr_i32 s73, s15, 16 +; GCN-HSA-NEXT: s_ashr_i32 s36, s1, 16 +; GCN-HSA-NEXT: s_ashr_i32 s39, s5, 31 +; GCN-HSA-NEXT: s_ashr_i32 s40, s5, 16 +; GCN-HSA-NEXT: s_ashr_i32 s41, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s42, s7, 16 +; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s44, s9, 16 +; GCN-HSA-NEXT: s_ashr_i32 s45, s11, 31 ; GCN-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[4:5], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[6:7], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[8:9], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[10:11], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[12:13], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[14:15], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[70:71], s[10:11], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[72:73], s[12:13], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[74:75], s[14:15], 0x100000 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x100000 @@ -8510,149 +8518,149 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[54:55], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[54:55], 0x100000 +; GCN-HSA-NEXT: 
s_bfe_i64 s[28:29], s[28:29], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x100000 +; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s54, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s55, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xc0 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s45 -; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s47 +; GCN-HSA-NEXT: s_add_u32 s46, s16, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s73 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s72 -; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s63 +; GCN-HSA-NEXT: s_addc_u32 s47, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s71 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 -; GCN-HSA-NEXT: s_add_u32 s38, s16, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s70 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s39 -; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s69 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s67 -; GCN-HSA-NEXT: flat_store_dwordx4 v[23:24], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[25:26], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s37 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s37 -; GCN-HSA-NEXT: s_add_u32 s36, s16, 16 -; GCN-HSA-NEXT: s_addc_u32 s37, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 -; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s74 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GCN-HSA-NEXT: s_add_u32 s30, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: s_addc_u32 s31, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s44 -; GCN-HSA-NEXT: flat_store_dwordx4 v[27:28], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, 
s50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: s_add_u32 s26, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s59 +; GCN-HSA-NEXT: s_addc_u32 s27, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s72 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: s_add_u32 s24, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s73 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: s_addc_u32 s25, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s46 +; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s38 +; GCN-HSA-NEXT: s_add_u32 s14, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s43 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s36 -; GCN-HSA-NEXT: flat_store_dwordx4 v[29:30], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s70 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s71 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s29 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_add_u32 s12, s16, 0xa0 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: s_add_u32 s12, s16, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s16, 0x80 +; GCN-HSA-NEXT: s_add_u32 s10, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 -; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s16, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index b39b38a..b534c2c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -6398,41 +6398,41 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s8, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s16, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 
s[16:17], s[16:17], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s5, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s20, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s17 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s11 ; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64: @@ -6445,11 +6445,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s6, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s8, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s14, s3 ; GFX7-HSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6465,32 +6465,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32 +; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 ; 
GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -6502,11 +6502,11 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s8, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 ; GFX8-NOHSA-NEXT: s_ashr_i32 s5, s3, 31 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 @@ -6522,32 +6522,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -6615,34 +6615,34 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out ; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_lshr_b32 s4, s3, 16 -; GFX12-NEXT: s_lshr_b32 s6, s3, 8 -; GFX12-NEXT: s_mov_b32 s8, s3 -; GFX12-NEXT: s_lshr_b32 s10, s2, 16 -; GFX12-NEXT: s_lshr_b32 s12, s2, 24 +; GFX12-NEXT: s_lshr_b32 s6, s2, 16 +; GFX12-NEXT: s_lshr_b32 s8, s2, 24 +; GFX12-NEXT: s_lshr_b32 s10, s2, 8 +; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX12-NEXT: s_lshr_b32 s12, s3, 8 +; GFX12-NEXT: s_mov_b32 s14, s3 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 ; GFX12-NEXT: s_ashr_i32 s15, s3, 31 ; GFX12-NEXT: s_ashr_i32 s18, s3, 24 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GFX12-NEXT: s_lshr_b32 s14, s2, 8 -; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s15 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s7 +; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v7, s11 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX12-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v5, s17 ; GFX12-NEXT: v_dual_mov_b32 v4, s16 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v9, s9 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s7 -; GFX12-NEXT: v_dual_mov_b32 v10, s6 :: v_dual_mov_b32 v13, s11 -; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13 -; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v7, s3 -; GFX12-NEXT: v_mov_b32_e32 v6, s2 +; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v13, s3 +; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_mov_b32_e32 v14, s12 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16 ; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] +; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48 +; 
GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32 ; GFX12-NEXT: s_endpgm %load = load <8 x i8>, ptr addrspace(4) %in %ext = sext <8 x i8> %load to <8 x i64> @@ -7033,80 +7033,81 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s14, s7 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s26, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s4, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s7, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s26, s7 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s5, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s8, s5 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[4:5], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s29, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s31, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s35, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s7, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s38, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s15 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s36 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s33 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s31 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s26 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s11 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 +; 
GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s35 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s26 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s27 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s12 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s12 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s13 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s31 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s23 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s10 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s11 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64: @@ -7118,31 +7119,33 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX7-HSA-NEXT: s_lshr_b32 s8, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s12, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s6, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s16, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s5, 8 -; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX7-HSA-NEXT: s_mov_b32 s24, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s26, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s30, s4, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[8:9], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s2, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s8, s6, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX7-HSA-NEXT: s_ashr_i32 s27, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s29, s5, 24 +; GFX7-HSA-NEXT: s_mov_b32 s22, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s24, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s28, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s36, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s34, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[26:27], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[8:9], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 @@ -7150,73 +7153,70 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s26, s0, 0x70 -; GFX7-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GFX7-HSA-NEXT: s_add_u32 s24, s0, 0x50 +; GFX7-HSA-NEXT: s_addc_u32 s25, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v11, s11 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 0x50 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s16 ; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 64 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 48 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-HSA-NEXT: s_add_u32 s10, s0, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x70 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 0x60 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-HSA-NEXT: s_add_u32 s6, s0, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_add_u32 s6, s0, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX7-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: s_add_u32 s0, s0, 32 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s27 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm @@ -7225,107 +7225,109 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX8-NOHSA: ; %bb.0: ; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s30, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s14, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NOHSA-NEXT: s_ashr_i32 s19, s5, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s31, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s10, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s8, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s11, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s11, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s28, s11 +; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s9, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9 +; GFX8-NOHSA-NEXT: s_ashr_i32 s25, s9, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s29, s9, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[8:9], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[10:11], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[28:29], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s28, s11, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s11, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[24:25], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s30, s7, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s7, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[18:19], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x70 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 -; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v3, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x60 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 64 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 +; GFX8-NOHSA-NEXT: s_add_u32 s12, s0, 16 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s13, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 0x50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s33 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; 
GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm @@ -7435,64 +7437,64 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s8, s7, 16 -; GFX12-NEXT: s_lshr_b32 s10, s7, 8 -; GFX12-NEXT: s_mov_b32 s12, s7 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GFX12-NEXT: s_ashr_i32 s33, s7, 31 -; GFX12-NEXT: s_ashr_i32 s36, s7, 24 +; GFX12-NEXT: s_lshr_b32 s2, s6, 16 +; GFX12-NEXT: s_lshr_b32 s8, s6, 24 +; GFX12-NEXT: s_lshr_b32 s10, s6, 8 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GFX12-NEXT: s_lshr_b32 s14, s6, 16 -; GFX12-NEXT: s_lshr_b32 s16, s6, 24 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX12-NEXT: s_lshr_b32 s12, s4, 16 +; GFX12-NEXT: s_lshr_b32 s14, s4, 24 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v3, s33 -; GFX12-NEXT: s_lshr_b32 s18, s6, 8 -; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s35 -; GFX12-NEXT: v_dual_mov_b32 v4, s34 :: v_dual_mov_b32 v1, s9 -; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v9, s13 -; GFX12-NEXT: s_lshr_b32 s20, s5, 16 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 +; GFX12-NEXT: s_lshr_b32 s16, s4, 8 +; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v9, s3 +; GFX12-NEXT: v_dual_mov_b32 v8, s2 :: v_dual_mov_b32 v11, s9 +; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v3, s11 +; GFX12-NEXT: 
s_lshr_b32 s18, s7, 16 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v11, s11 -; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s15 -; GFX12-NEXT: s_lshr_b32 s22, s5, 8 -; GFX12-NEXT: s_mov_b32 s24, s5 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v5, s31 +; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v13, s13 +; GFX12-NEXT: s_lshr_b32 s20, s7, 8 +; GFX12-NEXT: s_mov_b32 s22, s7 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX12-NEXT: s_lshr_b32 s24, s5, 16 +; GFX12-NEXT: s_ashr_i32 s33, s7, 31 +; GFX12-NEXT: s_ashr_i32 s36, s7, 24 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX12-NEXT: s_lshr_b32 s26, s4, 16 -; GFX12-NEXT: s_lshr_b32 s28, s4, 24 -; GFX12-NEXT: s_ashr_i32 s29, s5, 31 -; GFX12-NEXT: s_ashr_i32 s31, s5, 24 +; GFX12-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v15, s15 +; GFX12-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v7, s17 +; GFX12-NEXT: s_lshr_b32 s26, s5, 8 +; GFX12-NEXT: s_mov_b32 s28, s5 +; GFX12-NEXT: s_ashr_i32 s27, s5, 31 +; GFX12-NEXT: s_ashr_i32 s29, s5, 24 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s17 -; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v7, s19 -; GFX12-NEXT: s_lshr_b32 s30, s4, 8 +; GFX12-NEXT: v_mov_b32_e32 v6, s16 ; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: v_mov_b32_e32 v6, s18 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 ; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64 +; GFX12-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v3, s33 +; GFX12-NEXT: v_dual_mov_b32 v1, s19 :: v_dual_mov_b32 v2, s36 +; GFX12-NEXT: v_mov_b32_e32 v9, s23 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[26:27], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s21 +; GFX12-NEXT: v_dual_mov_b32 v10, s20 :: v_dual_mov_b32 v17, s25 +; GFX12-NEXT: v_dual_mov_b32 v16, s24 :: v_dual_mov_b32 v19, s27 +; GFX12-NEXT: v_dual_mov_b32 v18, s29 :: v_dual_mov_b32 v21, s5 +; GFX12-NEXT: v_dual_mov_b32 v20, s4 :: v_dual_mov_b32 v23, s7 +; GFX12-NEXT: v_mov_b32_e32 v22, s6 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[0:1] ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v3, s29 -; GFX12-NEXT: v_dual_mov_b32 v1, s21 :: v_dual_mov_b32 v2, s31 -; GFX12-NEXT: v_mov_b32_e32 v9, s25 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v11, s23 -; GFX12-NEXT: v_dual_mov_b32 v10, s22 :: v_dual_mov_b32 v17, s27 -; GFX12-NEXT: v_dual_mov_b32 v16, s26 :: v_dual_mov_b32 v19, s7 -; GFX12-NEXT: v_dual_mov_b32 v18, s6 :: v_dual_mov_b32 v21, s3 -; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s5 -; GFX12-NEXT: v_mov_b32_e32 v22, s4 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:80 -; GFX12-NEXT: global_store_b128 v24, v[4:7], 
s[0:1] offset:64 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:32 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:32 ; GFX12-NEXT: s_endpgm %load = load <16 x i8>, ptr addrspace(4) %in %ext = sext <16 x i8> %load to <16 x i64> @@ -8204,157 +8206,157 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s7, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s7, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s30, s7 ; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s6, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s6, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s6, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s5, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s5, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s34, s5 -; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s4, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s4, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s6, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s4, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s22, s4, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s24, s4, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s10, s2, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s2, 24 +; GFX6-NOHSA-NEXT: s_lshr_b32 s14, s2, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s18, s0, 24 +; GFX6-NOHSA-NEXT: s_mov_b32 s34, s7 ; GFX6-NOHSA-NEXT: s_ashr_i32 s11, s1, 31 ; GFX6-NOHSA-NEXT: s_ashr_i32 s13, s1, 24 -; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s3, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[34:35], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s5, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s23, s5, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_ashr_i32 s25, s7, 31 -; GFX6-NOHSA-NEXT: s_ashr_i32 s27, s7, 24 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[14:15], 0x80000 -; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s3, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s44, s3 -; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s2, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s38, s2, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 -; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s1, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s46, s1, 8 -; GFX6-NOHSA-NEXT: s_mov_b32 s52, s1 -; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s0, 16 -; GFX6-NOHSA-NEXT: s_lshr_b32 s50, s0, 24 -; GFX6-NOHSA-NEXT: s_lshr_b32 s54, s0, 8 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[14:15], s[0:1], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[62:63], s[4:5], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s15, s3, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s17, s3, 24 +; GFX6-NOHSA-NEXT: s_ashr_i32 s33, s5, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s49, s5, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[34:35], 0x80000 +; GFX6-NOHSA-NEXT: s_ashr_i32 s19, s7, 31 +; GFX6-NOHSA-NEXT: s_ashr_i32 s21, s7, 24 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[30:31], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[54:55], s[28:29], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s30, s7, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s34, s7, 8 +; GFX6-NOHSA-NEXT: s_lshr_b32 s36, s5, 16 
+; GFX6-NOHSA-NEXT: s_lshr_b32 s40, s5, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s46, s5 +; GFX6-NOHSA-NEXT: s_lshr_b32 s42, s3, 16 +; GFX6-NOHSA-NEXT: s_lshr_b32 s44, s3, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s50, s3 +; GFX6-NOHSA-NEXT: s_lshr_b32 s48, s1, 16 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[56:57], s[0:1], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[58:59], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[60:61], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_lshr_b32 s4, s1, 8 +; GFX6-NOHSA-NEXT: s_mov_b32 s6, s1 ; GFX6-NOHSA-NEXT: s_mov_b32 s0, s8 ; GFX6-NOHSA-NEXT: s_mov_b32 s1, s9 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s58 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s59 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s7 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s56 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s57 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s62 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s63 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s60 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s61 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s27 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s21 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s19 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s60 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s61 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s58 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s59 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s2 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s3 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s56 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s57 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s39 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s54 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s55 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s52 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s53 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s19 ; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 -; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v22, s6 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[26:27], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s8 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:208 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v23, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v24, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s25 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[54:55], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[50:51], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v25, s9 +; GFX6-NOHSA-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v26, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v27, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s25 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[6:7], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[22:23], s[50:51], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[24:25], s[46:47], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[6:7], s[48:49], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[8:9], s[44:45], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[26:27], s[42:43], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[38:39], s[40:41], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 ; GFX6-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 -; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[40:41], s[16:17], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[42:43], s[14:15], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 +; GFX6-NOHSA-NEXT: s_bfe_i64 s[46:47], s[10:11], 0x80000 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s42 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s43 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s40 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s41 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s17 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s18 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s19 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:144 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s13 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s11 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s20 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s21 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s52 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s53 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s30 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s31 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:112 -; GFX6-NOHSA-NEXT: s_waitcnt expcnt(1) -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s14 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s15 -; GFX6-NOHSA-NEXT: 
v_mov_b32_e32 v2, s34 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s49 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s33 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s42 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s43 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s40 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s41 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s18 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s19 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s24 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s25 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s28 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s29 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s17 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v9, s15 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v20, s30 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v21, s31 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:240 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s22 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v11, s23 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v18, s34 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v19, s35 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(2) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s13 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v15, s11 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s36 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s28 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s29 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s20 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s21 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s38 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s39 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:160 ; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s26 ; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s27 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s24 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s25 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s8 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v16, s6 -; GFX6-NOHSA-NEXT: v_mov_b32_e32 v17, s7 -; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s8 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s9 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0) +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v13, s7 +; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s5 +; 
GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GFX6-NOHSA-NEXT: s_endpgm ; ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64: @@ -8366,211 +8368,212 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-HSA-NEXT: s_lshr_b32 s10, s7, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s40, s7, 8 -; GFX7-HSA-NEXT: s_mov_b32 s42, s7 -; GFX7-HSA-NEXT: s_lshr_b32 s44, s6, 16 -; GFX7-HSA-NEXT: s_ashr_i32 s41, s1, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s43, s3, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s45, s3, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s48, s6, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s50, s6, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s52, s5, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s46, s5, 8 -; GFX7-HSA-NEXT: s_mov_b32 s54, s5 -; GFX7-HSA-NEXT: s_lshr_b32 s38, s4, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s36, s4, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s34, s4, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s28, s3, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s24, s3, 8 -; GFX7-HSA-NEXT: s_mov_b32 s26, s3 -; GFX7-HSA-NEXT: s_lshr_b32 s22, s2, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s20, s2, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 8 -; GFX7-HSA-NEXT: s_lshr_b32 s14, s1, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s56, s1, 8 -; GFX7-HSA-NEXT: s_mov_b32 s12, s1 -; GFX7-HSA-NEXT: s_lshr_b32 s58, s0, 16 -; GFX7-HSA-NEXT: s_lshr_b32 s60, s0, 24 -; GFX7-HSA-NEXT: s_lshr_b32 s62, s0, 8 -; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s12, s6, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24 ; GFX7-HSA-NEXT: s_ashr_i32 s33, s1, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s66, s5, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s67, s5, 24 -; GFX7-HSA-NEXT: s_ashr_i32 s68, s7, 31 -; GFX7-HSA-NEXT: s_ashr_i32 s69, s7, 24 -; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[64:65], s[6:7], 0x80000 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[56:57], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s37, s1, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s34, s0, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s30, s0, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s28, s0, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s64, s1, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s66, s1, 8 +; GFX7-HSA-NEXT: s_mov_b32 s68, s1 +; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[12:13], 0x80000 +; GFX7-HSA-NEXT: s_lshr_b32 s36, s6, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s40, s4, 16 +; GFX7-HSA-NEXT: s_ashr_i32 s41, s3, 31 +; GFX7-HSA-NEXT: s_lshr_b32 s50, s4, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s52, s4, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s54, s2, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s56, s2, 24 +; GFX7-HSA-NEXT: s_lshr_b32 s42, s2, 8 +; GFX7-HSA-NEXT: s_lshr_b32 s26, s7, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s20, s7, 8 +; GFX7-HSA-NEXT: s_mov_b32 s24, s7 +; GFX7-HSA-NEXT: s_lshr_b32 s18, s5, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s14, s5, 8 +; GFX7-HSA-NEXT: s_mov_b32 s16, s5 +; GFX7-HSA-NEXT: s_lshr_b32 s58, s3, 16 +; GFX7-HSA-NEXT: s_lshr_b32 s60, s3, 8 +; GFX7-HSA-NEXT: s_mov_b32 s62, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], 
s[10:11], 0x80000 +; GFX7-HSA-NEXT: s_ashr_i32 s44, s3, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s45, s5, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s46, s5, 24 +; GFX7-HSA-NEXT: s_ashr_i32 s47, s7, 31 +; GFX7-HSA-NEXT: s_ashr_i32 s48, s7, 24 +; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[2:3], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[70:71], s[4:5], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[72:73], s[6:7], 0x80000 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-HSA-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[6:7], s[60:61], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 ; GFX7-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[56:57], s[44:45], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[42:43], 0x80000 -; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[40:41], 0x80000 -; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xf0 +; GFX7-HSA-NEXT: s_bfe_i64 s[58:59], s[40:41], 0x80000 +; GFX7-HSA-NEXT: s_bfe_i64 s[60:61], s[36:37], 0x80000 +; GFX7-HSA-NEXT: s_add_u32 s62, s8, 0xd0 ; GFX7-HSA-NEXT: s_addc_u32 s63, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s58 -; GFX7-HSA-NEXT: s_add_u32 s58, s8, 0xe0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s59 -; GFX7-HSA-NEXT: s_addc_u32 s59, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xd0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s49 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xc0 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s62 -; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GFX7-HSA-NEXT: s_add_u32 s48, s8, 0xb0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s63 -; GFX7-HSA-NEXT: s_addc_u32 s49, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s69 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s68 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s58 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3] ; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s38 -; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0xa0 +; GFX7-HSA-NEXT: s_add_u32 s60, s8, 0xc0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s61 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s59 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX7-HSA-NEXT: s_addc_u32 s61, s9, 0 +; 
GFX7-HSA-NEXT: v_mov_b32_e32 v10, s50 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x90 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s51 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s51 +; GFX7-HSA-NEXT: s_add_u32 s50, s8, 0x80 +; GFX7-HSA-NEXT: s_addc_u32 s51, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s62 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s38 +; GFX7-HSA-NEXT: s_add_u32 s38, s8, 0x50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s63 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s39 ; GFX7-HSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s60 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s72 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s34 +; GFX7-HSA-NEXT: s_add_u32 s34, s8, 64 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s73 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s61 +; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s38 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s35 +; GFX7-HSA-NEXT: s_addc_u32 s35, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s58 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s59 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s54 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s55 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s56 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s57 +; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s39 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s56 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s30 -; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x90 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s31 -; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s30 -; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s31 -; GFX7-HSA-NEXT: s_add_u32 s30, s8, 0x80 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s57 -; GFX7-HSA-NEXT: s_addc_u32 s31, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s28 -; GFX7-HSA-NEXT: s_add_u32 s28, s8, 0x70 -; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s65 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s51 -; GFX7-HSA-NEXT: v_mov_b32_e32 v30, s48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s29 -; GFX7-HSA-NEXT: s_addc_u32 s29, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s52 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s53 -; GFX7-HSA-NEXT: v_mov_b32_e32 v31, s49 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s67 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s66 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s38 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s54 -; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s24 -; GFX7-HSA-NEXT: s_add_u32 s24, s8, 0x60 -; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s55 -; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s46 -; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s47 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s39 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s37 -; GFX7-HSA-NEXT: v_mov_b32_e32 v26, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s22 +; GFX7-HSA-NEXT: s_add_u32 s22, s8, 16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s23 +; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s25 -; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s28 -; GFX7-HSA-NEXT: s_addc_u32 s25, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s24 -; GFX7-HSA-NEXT: v_mov_b32_e32 v27, s31 -; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s34 -; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s35 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] -; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s26 -; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s45 -; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s43 -; 
GFX7-HSA-NEXT: v_mov_b32_e32 v13, s27 -; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s29 -; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s25 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-HSA-NEXT: s_add_u32 s20, s8, 0x50 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-HSA-NEXT: s_addc_u32 s21, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX7-HSA-NEXT: s_add_u32 s16, s8, 64 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-HSA-NEXT: s_addc_u32 s17, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-HSA-NEXT: v_mov_b32_e32 v24, s34 +; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s23 +; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xf0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s31 +; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v18, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v28, s50 +; GFX7-HSA-NEXT: v_mov_b32_e32 v22, s42 +; GFX7-HSA-NEXT: v_mov_b32_e32 v23, s43 +; GFX7-HSA-NEXT: v_mov_b32_e32 v25, s35 +; GFX7-HSA-NEXT: v_mov_b32_e32 v19, s23 +; GFX7-HSA-NEXT: s_add_u32 s22, s8, 0xe0 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s70 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s71 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s52 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s53 +; GFX7-HSA-NEXT: v_mov_b32_e32 v29, s51 +; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s28 +; GFX7-HSA-NEXT: v_mov_b32_e32 v7, s29 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[24:25], v[20:23] +; GFX7-HSA-NEXT: s_addc_u32 s23, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v20, s22 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s26 +; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s27 +; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v11, s47 +; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GFX7-HSA-NEXT: v_mov_b32_e32 v21, s23 +; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s20 +; GFX7-HSA-NEXT: v_mov_b32_e32 v15, s21 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GFX7-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX7-HSA-NEXT: s_add_u32 s18, s8, 0xb0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-HSA-NEXT: s_addc_u32 s19, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s46 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GFX7-HSA-NEXT: s_add_u32 s14, s8, 48 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-HSA-NEXT: s_add_u32 s14, s8, 0xa0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s15 ; GFX7-HSA-NEXT: s_addc_u32 s15, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s41 -; GFX7-HSA-NEXT: 
v_mov_b32_e32 v3, s33 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s15 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-HSA-NEXT: s_add_u32 s10, s8, 32 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-HSA-NEXT: s_addc_u32 s11, s9, 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GFX7-HSA-NEXT: s_add_u32 s12, s8, 0x70 ; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GFX7-HSA-NEXT: s_addc_u32 s13, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s12 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s41 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s13 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_nop 0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-HSA-NEXT: s_add_u32 s4, s8, 16 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-HSA-NEXT: s_add_u32 s6, s8, 0x60 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-HSA-NEXT: s_addc_u32 s7, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-HSA-NEXT: s_add_u32 s4, s8, 48 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-HSA-NEXT: s_addc_u32 s5, s9, 0 ; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s37 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-HSA-NEXT: s_nop 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32 +; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0 +; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX7-HSA-NEXT: s_endpgm ; @@ -8580,140 +8583,175 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NOHSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 ; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s7, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s7, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s54, s7 -; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s6, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s6, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s6, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s5, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s5, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s46, s5 -; GFX8-NOHSA-NEXT: s_lshr_b32 s42, s4, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s4, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s38, s4, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s3, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s3, 8 -; GFX8-NOHSA-NEXT: s_mov_b32 s30, s3 -; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s2, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s26, s2, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s2, 8 -; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s1, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s1, 8 -; 
GFX8-NOHSA-NEXT: s_mov_b32 s18, s1 -; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s0, 16 -; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s0, 24 -; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s0, 8 -; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s3, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[2:3], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[44:45], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_lshr_b32 s46, s6, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s48, s6, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s50, s6, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s52, s4, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s54, s4, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s56, s4, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s58, s2, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s60, s2, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s40, s2, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s36, s0, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s34, s0, 24 +; GFX8-NOHSA-NEXT: s_lshr_b32 s28, s0, 8 +; GFX8-NOHSA-NEXT: s_lshr_b32 s62, s7, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s24, s7, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s22, s7 +; GFX8-NOHSA-NEXT: s_lshr_b32 s64, s5, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s5, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s18, s5 +; GFX8-NOHSA-NEXT: s_lshr_b32 s66, s3, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s3, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s14, s3 +; GFX8-NOHSA-NEXT: s_lshr_b32 s44, s1, 16 +; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s1, 8 +; GFX8-NOHSA-NEXT: s_mov_b32 s10, s1 +; GFX8-NOHSA-NEXT: s_ashr_i32 s63, s5, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[0:1], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[4:5], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[68:69], s[6:7], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s1, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s42, s1, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[44:45], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s43, s3, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s44, s3, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[66:67], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[66:67], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s45, s5, 31 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s7, 31 +; GFX8-NOHSA-NEXT: s_ashr_i32 s65, s7, 24 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[62:63], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s33, s3, 31 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[64:65], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s64, s5, 31 -; 
GFX8-NOHSA-NEXT: s_ashr_i32 s5, s5, 24 -; GFX8-NOHSA-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX8-NOHSA-NEXT: s_ashr_i32 s66, s7, 31 -; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s7, 24 ; GFX8-NOHSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s50 -; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xf0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s51 -; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s66 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xe0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s54 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s55 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s52 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s53 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 -; GFX8-NOHSA-NEXT: s_add_u32 s50, s8, 0xd0 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX8-NOHSA-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xd0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0xc0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s68 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s69 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s50 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s51 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 +; GFX8-NOHSA-NEXT: s_add_u32 s46, s8, 0x90 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s51, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s56 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s57 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s58 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s51 +; GFX8-NOHSA-NEXT: s_addc_u32 s47, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s46 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s52 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s53 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s54 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s55 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s47 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s44 -; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xc0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s45 -; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s38 +; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x80 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s39 +; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s56 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s57 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 +; GFX8-NOHSA-NEXT: s_add_u32 s38, s8, 0x50 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s39, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s38 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s58 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s59 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s60 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s61 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xb0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s62 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s63 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 -; GFX8-NOHSA-NEXT: s_add_u32 s44, s8, 0xa0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_addc_u32 s45, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s44 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s46 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s47 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s48 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s49 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s39 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 +; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 +; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s40 -; GFX8-NOHSA-NEXT: s_add_u32 s40, s8, 0x90 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s41 -; GFX8-NOHSA-NEXT: s_addc_u32 s41, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s40 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s42 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s43 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s41 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 +; GFX8-NOHSA-NEXT: s_add_u32 s30, s8, 16 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: s_addc_u32 s31, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s30 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s36 -; GFX8-NOHSA-NEXT: s_add_u32 s36, s8, 0x80 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s37 -; GFX8-NOHSA-NEXT: s_addc_u32 s37, s9, 0 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s36 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s38 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s37 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s31 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s28 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s29 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xf0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s64 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: s_add_u32 s6, s8, 0xe0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s7, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: 
v_mov_b32_e32 v5, s7 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_nop 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xb0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s63 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s45 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NOHSA-NEXT: s_add_u32 s4, s8, 0xa0 +; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NOHSA-NEXT: s_addc_u32 s5, s9, 0 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2 @@ -8723,33 +8761,15 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x60 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s65 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 0x50 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s30 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NOHSA-NEXT: s_add_u32 s2, s8, 64 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s28 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s29 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s26 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s27 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s44 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s43 ; GFX8-NOHSA-NEXT: s_addc_u32 s3, s9, 0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s23 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s24 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s25 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_nop 0 @@ -8760,32 +8780,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s18 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s21 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s42 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s33 ; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0 ; GFX8-NOHSA-NEXT: 
flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s17 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s13 -; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NOHSA-NEXT: s_endpgm ; @@ -8984,122 +8988,120 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_load_b256 s[0:7], s[10:11], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_lshr_b32 s40, s7, 16 -; GFX12-NEXT: s_lshr_b32 s50, s6, 8 -; GFX12-NEXT: s_lshr_b32 s62, s3, 16 -; GFX12-NEXT: s_ashr_i32 s51, s3, 24 -; GFX12-NEXT: s_lshr_b32 s42, s7, 8 -; GFX12-NEXT: s_mov_b32 s44, s7 -; GFX12-NEXT: s_lshr_b32 s46, s6, 16 -; GFX12-NEXT: s_lshr_b32 s48, s6, 24 -; GFX12-NEXT: s_lshr_b32 s38, s5, 16 -; GFX12-NEXT: s_lshr_b32 s52, s5, 8 -; GFX12-NEXT: s_mov_b32 s54, s5 -; GFX12-NEXT: s_lshr_b32 s56, s4, 16 -; GFX12-NEXT: s_lshr_b32 s58, s4, 24 -; GFX12-NEXT: s_lshr_b32 s60, s4, 8 -; GFX12-NEXT: s_lshr_b32 s36, s3, 8 -; GFX12-NEXT: s_mov_b32 s34, s3 -; GFX12-NEXT: s_lshr_b32 s28, s2, 16 -; GFX12-NEXT: s_lshr_b32 s26, s2, 24 -; GFX12-NEXT: s_lshr_b32 s24, s2, 8 -; GFX12-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[4:5], 0x80000 +; GFX12-NEXT: s_lshr_b32 s34, s6, 16 +; GFX12-NEXT: s_lshr_b32 s36, s6, 24 +; GFX12-NEXT: s_lshr_b32 s38, s6, 8 +; GFX12-NEXT: s_lshr_b32 s40, s4, 16 +; GFX12-NEXT: s_lshr_b32 s42, s4, 24 +; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: s_lshr_b32 s44, s4, 8 ; GFX12-NEXT: s_bfe_i64 s[66:67], s[6:7], 0x80000 -; GFX12-NEXT: s_ashr_i32 s39, s3, 31 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 -; GFX12-NEXT: s_ashr_i32 s62, s5, 31 -; GFX12-NEXT: s_ashr_i32 s63, s5, 24 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX12-NEXT: s_ashr_i32 s50, s7, 31 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s35 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_ashr_i32 s7, s7, 24 +; GFX12-NEXT: v_dual_mov_b32 v0, s34 :: v_dual_mov_b32 v3, s37 +; GFX12-NEXT: v_dual_mov_b32 v2, s36 :: v_dual_mov_b32 v5, s67 +; GFX12-NEXT: s_lshr_b32 s28, s2, 16 +; GFX12-NEXT: s_lshr_b32 s46, s2, 24 +; GFX12-NEXT: s_bfe_i64 s[64:65], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v1, s41 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s66 :: v_dual_mov_b32 v7, s39 +; GFX12-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v9, s41 +; GFX12-NEXT: s_lshr_b32 s48, s2, 8 +; GFX12-NEXT: v_dual_mov_b32 v8, s40 :: v_dual_mov_b32 v11, s43 +; GFX12-NEXT: v_dual_mov_b32 v10, s42 :: v_dual_mov_b32 v13, s65 +; GFX12-NEXT: s_lshr_b32 s50, s0, 16 +; GFX12-NEXT: s_lshr_b32 s52, s0, 24 ; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v0, 
s40 :: v_dual_mov_b32 v3, s50 -; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s45 -; GFX12-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v7, s43 -; GFX12-NEXT: v_dual_mov_b32 v6, s42 :: v_dual_mov_b32 v9, s47 -; GFX12-NEXT: v_dual_mov_b32 v8, s46 :: v_dual_mov_b32 v11, s49 -; GFX12-NEXT: v_dual_mov_b32 v10, s48 :: v_dual_mov_b32 v13, s67 -; GFX12-NEXT: v_dual_mov_b32 v12, s66 :: v_dual_mov_b32 v15, s5 -; GFX12-NEXT: v_mov_b32_e32 v14, s4 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[38:39], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s64 :: v_dual_mov_b32 v15, s45 +; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GFX12-NEXT: v_mov_b32_e32 v14, s44 +; GFX12-NEXT: s_lshr_b32 s54, s0, 8 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[2:3], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GFX12-NEXT: s_lshr_b32 s56, s7, 16 +; GFX12-NEXT: s_lshr_b32 s58, s5, 16 +; GFX12-NEXT: s_lshr_b32 s60, s1, 8 +; GFX12-NEXT: s_mov_b32 s62, s1 +; GFX12-NEXT: s_ashr_i32 s57, s1, 24 +; GFX12-NEXT: s_ashr_i32 s59, s3, 31 +; GFX12-NEXT: s_ashr_i32 s61, s3, 24 +; GFX12-NEXT: s_ashr_i32 s63, s5, 31 ; GFX12-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 ; GFX12-NEXT: s_clause 0x3 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:240 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:224 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:208 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:192 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s62 -; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s63 -; GFX12-NEXT: v_mov_b32_e32 v5, s55 -; GFX12-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s54 :: v_dual_mov_b32 v7, s53 -; GFX12-NEXT: v_dual_mov_b32 v6, s52 :: v_dual_mov_b32 v9, s57 -; GFX12-NEXT: v_dual_mov_b32 v8, s56 :: v_dual_mov_b32 v11, s59 -; GFX12-NEXT: v_dual_mov_b32 v10, s58 :: v_dual_mov_b32 v13, s31 -; GFX12-NEXT: s_lshr_b32 s22, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s30 :: v_dual_mov_b32 v15, s61 -; GFX12-NEXT: v_dual_mov_b32 v14, s60 :: v_dual_mov_b32 v17, s3 -; GFX12-NEXT: s_lshr_b32 s16, s1, 8 -; GFX12-NEXT: s_mov_b32 s18, s1 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v16, s2 :: v_dual_mov_b32 v19, s39 -; GFX12-NEXT: v_dual_mov_b32 v18, s51 :: v_dual_mov_b32 v21, s35 -; GFX12-NEXT: s_lshr_b32 s14, s0, 16 -; GFX12-NEXT: s_lshr_b32 s12, s0, 24 -; GFX12-NEXT: s_ashr_i32 s6, s1, 31 -; GFX12-NEXT: s_ashr_i32 s33, s1, 24 -; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v23, s37 -; GFX12-NEXT: v_mov_b32_e32 v22, s36 -; GFX12-NEXT: s_clause 0x5 -; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 -; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:208 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:192 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:144 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:128 -; GFX12-NEXT: global_store_b128 v24, 
v[16:19], s[8:9] offset:112 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:96 -; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s27 -; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s26 -; GFX12-NEXT: v_mov_b32_e32 v5, s21 -; GFX12-NEXT: s_lshr_b32 s64, s0, 8 +; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v3, s47 +; GFX12-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s46 +; GFX12-NEXT: v_mov_b32_e32 v5, s31 +; GFX12-NEXT: s_lshr_b32 s26, s7, 8 +; GFX12-NEXT: s_mov_b32 s24, s7 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[0:1], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s30 :: v_dual_mov_b32 v7, s49 +; GFX12-NEXT: v_dual_mov_b32 v6, s48 :: v_dual_mov_b32 v9, s51 +; GFX12-NEXT: s_lshr_b32 s18, s5, 8 +; GFX12-NEXT: s_mov_b32 s20, s5 +; GFX12-NEXT: s_lshr_b32 s16, s3, 16 +; GFX12-NEXT: s_lshr_b32 s12, s3, 8 +; GFX12-NEXT: s_mov_b32 s14, s3 +; GFX12-NEXT: s_lshr_b32 s10, s1, 16 +; GFX12-NEXT: s_ashr_i32 s33, s1, 31 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[62:63], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[60:61], 0x80000 +; GFX12-NEXT: s_ashr_i32 s60, s5, 24 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x80000 +; GFX12-NEXT: s_ashr_i32 s58, s7, 31 +; GFX12-NEXT: s_ashr_i32 s62, s7, 24 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s50 :: v_dual_mov_b32 v11, s53 +; GFX12-NEXT: v_dual_mov_b32 v10, s52 :: v_dual_mov_b32 v13, s23 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v12, s22 :: v_dual_mov_b32 v15, s55 +; GFX12-NEXT: v_dual_mov_b32 v14, s54 :: v_dual_mov_b32 v17, s7 +; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v16, s6 :: v_dual_mov_b32 v19, s58 +; GFX12-NEXT: v_dual_mov_b32 v18, s62 :: v_dual_mov_b32 v21, s25 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s25 -; GFX12-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v9, s23 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v8, s22 :: v_dual_mov_b32 v11, s6 -; GFX12-NEXT: v_dual_mov_b32 v10, s33 :: v_dual_mov_b32 v13, s19 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[64:65], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s17 -; GFX12-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v17, s15 -; GFX12-NEXT: v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s13 -; GFX12-NEXT: v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v21, s11 -; GFX12-NEXT: v_dual_mov_b32 v20, s10 :: v_dual_mov_b32 v23, s1 -; GFX12-NEXT: v_mov_b32_e32 v22, s0 +; GFX12-NEXT: v_dual_mov_b32 v20, s24 :: v_dual_mov_b32 v23, s27 +; GFX12-NEXT: v_mov_b32_e32 v22, s26 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:64 -; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:48 -; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:32 -; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:16 -; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:16 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:240 +; 
GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:224 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s63 +; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s60 +; GFX12-NEXT: v_mov_b32_e32 v5, s21 +; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v7, s19 +; GFX12-NEXT: v_dual_mov_b32 v6, s18 :: v_dual_mov_b32 v9, s17 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v11, s59 +; GFX12-NEXT: v_dual_mov_b32 v10, s61 :: v_dual_mov_b32 v13, s15 +; GFX12-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v15, s13 +; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s11 +; GFX12-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v19, s33 +; GFX12-NEXT: v_dual_mov_b32 v18, s57 :: v_dual_mov_b32 v21, s3 +; GFX12-NEXT: v_dual_mov_b32 v20, s2 :: v_dual_mov_b32 v23, s1 +; GFX12-NEXT: v_mov_b32_e32 v22, s0 +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:176 +; GFX12-NEXT: global_store_b128 v24, v[4:7], s[8:9] offset:160 +; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:112 +; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:96 +; GFX12-NEXT: global_store_b128 v24, v[16:19], s[8:9] offset:48 +; GFX12-NEXT: global_store_b128 v24, v[20:23], s[8:9] offset:32 ; GFX12-NEXT: s_endpgm %load = load <32 x i8>, ptr addrspace(4) %in %ext = sext <32 x i8> %load to <32 x i64> @@ -9832,24 +9834,50 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_zextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1 +; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: AND_INT T0.W, T7.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T7.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, 
+; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T8.X, T4.X, ; ; GFX12-LABEL: constant_zextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -9951,23 +9979,56 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_sextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1 +; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.W, T4.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) -; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T7.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T8.X, T4.X, ; ; GFX12-LABEL: constant_sextload_v4i8_to_v4i16: ; GFX12: ; %bb.0: @@ -10088,27 +10149,80 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_zextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 ; 
EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: AND_INT T0.W, T11.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T11.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T11.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T12.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T11.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W, -; EG-NEXT: AND_INT * T6.Z, T5.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T6.X, T5.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T12.X, T8.X, +; EG-NEXT: MOV * T12.Z, T4.X, ; ; GFX12-LABEL: constant_zextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10255,28 +10369,93 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out ; ; EG-LABEL: constant_sextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, 
KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T11.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T12.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, 
PV.X, +; EG-NEXT: ASHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T12.X, T8.X, +; EG-NEXT: MOV * T12.Z, T4.X, ; ; GFX12-LABEL: constant_sextload_v8i8_to_v8i16: ; GFX12: ; %bb.0: @@ -10472,37 +10651,146 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_zextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 103, @12, KC0[], KC1[] +; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT T0.W, T19.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T19.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T8.W, T7.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W, -; EG-NEXT: AND_INT T8.Z, T7.Y, literal.y, -; EG-NEXT: BFE_UINT * T9.W, T7.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T8.X, T7.X, literal.x, -; EG-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT * T9.Z, T7.W, literal.x, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T9.X, T7.Z, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) -; EG-NEXT: LSHR * T10.X, PV.W, literal.x, +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T20.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, 
PV.Y, +; EG-NEXT: MOV * T0.Y, T12.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T20.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T19.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 116: +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR T0.W, T19.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) +; EG-NEXT: LSHR T21.X, PS, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; 
EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00) +; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T19.W, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T20.X, T16.X, +; EG-NEXT: MOV * T20.Z, T12.X, +; EG-NEXT: MOV T19.X, T8.X, +; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_zextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -10753,38 +11041,173 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_sextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 104, @12, KC0[], KC1[] +; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x, +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T7.Y, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T7.W, literal.x, -; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: 
AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y, -; EG-NEXT: LSHR T1.Z, T7.Z, literal.y, -; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T10.X, PS, literal.x, -; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: ALU clause starting at 117: +; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV 
T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR T0.W, T19.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) +; EG-NEXT: LSHR T21.X, PS, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, +; EG-NEXT: LSHL * T0.W, PV.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T19.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T20.X, T16.X, +; EG-NEXT: MOV * T20.Z, T12.X, +; EG-NEXT: MOV T19.X, T8.X, +; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_sextload_v16i8_to_v16i16: ; GFX12: ; %bb.0: @@ -11132,58 +11555,276 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_zextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 37, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 +; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @10 +; EG-NEXT: ALU 103, @16, KC0[], KC1[] +; EG-NEXT: ALU 104, @120, KC0[], KC1[] +; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: AND_INT T0.W, T37.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T37.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, 
PV.W, PS, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T13.W, T11.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, -; EG-NEXT: AND_INT T13.Z, T11.Y, literal.y, -; EG-NEXT: BFE_UINT * T14.W, T11.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T13.X, T11.X, literal.x, -; EG-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W, -; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T14.Z, T11.W, literal.x, -; EG-NEXT: BFE_UINT * T15.W, T12.Y, literal.y, T0.W, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: AND_INT T14.X, T11.Z, literal.x, -; EG-NEXT: BFE_UINT T15.Y, T12.X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T12.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T16.X, PV.W, literal.x, -; EG-NEXT: AND_INT T15.Z, T12.Y, literal.y, -; EG-NEXT: BFE_UINT T17.W, T12.W, literal.z, T0.W, -; EG-NEXT: AND_INT * T15.X, T12.X, literal.y, -; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T17.Y, T12.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T12.X, PV.W, literal.x, -; EG-NEXT: AND_INT T17.Z, T12.W, literal.y, -; EG-NEXT: AND_INT * T17.X, T12.Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, +; EG-NEXT: 
8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T37.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 120: +; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T32.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.X, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T32.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T32.X, PV.W, +; EG-NEXT: MOV T0.Y, T33.X, +; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T33.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T38.Y, PV.W, PS, +; EG-NEXT: MOV T33.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T28.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T28.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.Y, literal.x, +; EG-NEXT: 
8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T28.X, PV.W, +; EG-NEXT: MOV T0.Y, T29.X, +; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T29.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T38.W, PV.W, PS, +; EG-NEXT: MOV T29.X, PV.W, +; EG-NEXT: MOV * T0.Y, T24.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T24.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T24.X, PV.W, +; EG-NEXT: MOV T0.Y, T25.X, +; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T25.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T35.Y, PV.W, PS, +; EG-NEXT: MOV T25.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T20.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T20.X, PV.W, +; EG-NEXT: ALU clause starting at 225: +; EG-NEXT: MOV T0.Y, T20.X, +; EG-NEXT: LSHL * T1.W, T35.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T20.X, PV.W, +; EG-NEXT: MOV T0.Y, T21.X, +; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV * T21.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T18.X, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T0.W, T35.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) +; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.z, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, 
literal.w, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44) +; EG-NEXT: LSHR T42.X, PS, literal.x, +; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T21.X, PV.W, +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T38.X, T32.X, +; EG-NEXT: MOV * T38.Z, T28.X, +; EG-NEXT: MOV T35.X, T24.X, +; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_zextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: @@ -11642,60 +12283,331 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o ; ; EG-LABEL: constant_sextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1 +; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @10 +; EG-NEXT: ALU 104, @16, KC0[], KC1[] +; EG-NEXT: ALU 104, @121, KC0[], KC1[] +; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: -; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T14.X, PV.W, literal.x, -; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.Y, T12.W, literal.x, -; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR T0.W, T12.Y, literal.x, -; EG-NEXT: LSHR * T1.W, T11.Y, literal.x, +; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x, -; EG-NEXT: LSHR T1.Y, T11.W, literal.x, -; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T11.X, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, 
+; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T11.Z, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T12.X, literal.x, -; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T11.X, PS, literal.x, -; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: LSHR T0.Z, T12.Z, literal.y, -; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T12.X, PS, literal.x, -; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, 
+; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: ALU clause starting at 121: +; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T37.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.W, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T32.X, +; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T32.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T32.X, PV.W, +; EG-NEXT: MOV T0.Y, T33.X, +; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; 
EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T33.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T38.Y, PV.W, PS, +; EG-NEXT: MOV T33.X, PV.Y, +; EG-NEXT: MOV T0.Y, T28.X, +; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T28.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T28.X, PV.W, +; EG-NEXT: MOV T0.Y, T29.X, +; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T29.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 226: +; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T0.W, T0.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T38.W, PV.W, PS, +; EG-NEXT: MOV T29.X, PV.W, +; EG-NEXT: MOV T0.Y, T24.X, +; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T24.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T24.X, PV.W, +; EG-NEXT: MOV T0.Y, T25.X, +; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T25.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: 
OR_INT * T35.Y, PV.W, PS, +; EG-NEXT: MOV T25.X, PV.Y, +; EG-NEXT: MOV T0.Y, T20.X, +; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T20.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T20.X, PV.W, +; EG-NEXT: MOV T0.Y, T21.X, +; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T21.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ASHR T0.W, T35.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44) +; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; EG-NEXT: LSHL T0.W, PV.W, literal.z, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T42.X, PS, literal.x, +; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T21.X, PV.W, +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T38.X, T32.X, +; EG-NEXT: MOV * T38.Z, T28.X, +; EG-NEXT: MOV T35.X, T24.X, +; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 ; ; GFX12-LABEL: constant_sextload_v32i8_to_v32i16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 3753737..09d3c3b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -263,63 +263,74 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac ; ; EG-LABEL: global_load_v3i16: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 -; EG-NEXT: MEM_RAT MSKOR T2.XW, T0.X +; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 2 @6 +; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0 +; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X ; EG-NEXT: CF_END ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 +; EG-NEXT: VTX_READ_16 T5.X, T5.X, 
4, #1 +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MOV * T5.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 13: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT T1.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T2.W, T0.X, literal.y, +; EG-NEXT: AND_INT * T2.W, T5.X, literal.y, ; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) ; EG-NEXT: LSHL * T1.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T2.X, T2.W, PV.W, -; EG-NEXT: LSHL * T2.W, literal.x, PV.W, +; EG-NEXT: LSHL T5.X, T2.W, PV.W, +; EG-NEXT: LSHL * T5.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T2.Y, 0.0, -; EG-NEXT: MOV * T2.Z, 0.0, -; EG-NEXT: LSHR T0.X, T0.W, literal.x, -; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV T5.Y, 0.0, +; EG-NEXT: MOV * T5.Z, 0.0, +; EG-NEXT: LSHR T8.X, T0.W, literal.x, +; EG-NEXT: LSHL T0.W, T7.X, literal.y, +; EG-NEXT: AND_INT * T1.W, T6.X, literal.z, +; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT T6.X, PV.W, PS, +; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CM-LABEL: global_load_v3i16: ; CM: ; %bb.0: ; %entry -; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 1 @6 -; CM-NEXT: ALU 15, @11, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT MSKOR T2.XW, T3.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 2 @6 +; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X ; CM-NEXT: CF_END ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_16 T1.X, T0.X, 0, #1 -; CM-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 -; CM-NEXT: ALU clause starting at 10: -; CM-NEXT: MOV * T0.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 11: +; CM-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1 +; CM-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1 +; CM-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1 +; CM-NEXT: ALU clause starting at 12: +; CM-NEXT: MOV * T5.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 13: ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; CM-NEXT: AND_INT * T1.W, PV.W, literal.x, ; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; CM-NEXT: AND_INT T0.Z, T0.X, literal.x, +; CM-NEXT: AND_INT T0.Z, T5.X, literal.x, ; CM-NEXT: LSHL * T1.W, PV.W, literal.y, ; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45) -; CM-NEXT: LSHL T2.X, PV.Z, PV.W, -; CM-NEXT: LSHL * T2.W, literal.x, PV.W, +; CM-NEXT: LSHL T5.X, PV.Z, PV.W, +; CM-NEXT: LSHL * T5.W, literal.x, PV.W, ; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; CM-NEXT: MOV T2.Y, 0.0, -; CM-NEXT: MOV * T2.Z, 0.0, -; CM-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: MOV T5.Y, 0.0, +; CM-NEXT: MOV * T5.Z, 0.0, +; CM-NEXT: LSHL T0.Z, T7.X, literal.x, +; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W, +; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; CM-NEXT: LSHR * T3.X, T0.W, literal.x, +; CM-NEXT: LSHR * T8.X, T0.W, literal.x, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %ld = load <3 x i16>, ptr addrspace(1) %in @@ -1738,8 +1749,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: 
s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 @@ -1758,8 +1769,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] @@ -6365,8 +6376,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 @@ -6390,28 +6401,28 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v5, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 @@ -6420,9 +6431,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -6964,58 +6975,59 @@ define amdgpu_kernel void 
@global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v3 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 16, v5 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v5, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 31, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 16, v7 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v18, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v17, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v9, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v7 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v5 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v5 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64: @@ -7037,31 +7049,31 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 @@ -7069,36 +7081,36 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v5 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v9, v16, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v9, v18, 0, 16 ; 
GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; GCN-HSA-NEXT: v_bfe_i32 v6, v17, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v6, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[7:10] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v16, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v2, v18, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v15, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 @@ -7107,9 +7119,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[4:7] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; @@ -8100,113 +8112,115 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:16 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32 -; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 +; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3) +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v15 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v12 +; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v3 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, v15 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v8 -; GCN-NOHSA-SI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v14 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:240 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v7 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v4 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v23, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v15 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v1 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v12 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v13 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v13 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v13, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v11 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v3 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v1 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v1 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v22, 0, 16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v26, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v5 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v5, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v11 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v13, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v12, 0, 
16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v9 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 16, v9 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v14, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v21, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v20, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v1, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v8, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v15 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v13 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v13, 0, 16 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v12, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v14, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v3, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v20, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v9 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v9, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v10, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:224 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v8, 0, 16 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v10, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v19, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v18, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v17, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v16, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v6, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v0, 0, 16 -; GCN-NOHSA-SI-NEXT: v_bfe_i32 v27, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v6, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v19, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v18, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v17, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v16, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v0, 0, 16 +; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v2, 0, 16 +; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v0, 0, 16 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v26, 31, v25 +; 
GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 31, v28 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v28, 31, v27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:192 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:160 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 31, v22 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 31, v30 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:128 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64: @@ -8218,180 +8232,179 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: 
v_mov_b32_e32 v24, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v7 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_waitcnt vmcnt(4) -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v1 +; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 -; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 -; GCN-HSA-NEXT: s_waitcnt vmcnt(5) -; GCN-HSA-NEXT: v_bfe_i32 v16, v9, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v9 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_bfe_i32 v18, v13, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[13:16] +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: 
v_bfe_i32 v12, v12, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v14, 0, 16 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) -; GCN-HSA-NEXT: v_bfe_i32 v16, v13, 0, 16 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v13 +; GCN-HSA-NEXT: v_bfe_i32 v12, v9, 0, 16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v28, 16, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, v11 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v9 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v16, v29, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v14, v18, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v10, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v28, 0, 16 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 16, v11 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: s_waitcnt vmcnt(5) +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 16, v2 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[16:19] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 -; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v10, v20, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, v3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v21, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v4 +; GCN-HSA-NEXT: v_bfe_i32 v12, v4, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v1, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 16, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11] +; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v5 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 
v26, v26, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 31, v24 +; GCN-HSA-NEXT: v_bfe_i32 v14, v27, 0, 16 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 31, v26 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_bfe_i32 v20, v1, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v23, 31, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 16, v1 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v21, 31, v20 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, v7 +; GCN-HSA-NEXT: v_bfe_i32 v18, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[19:22] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GCN-HSA-NEXT: v_bfe_i32 v0, v19, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v21, 16, v10 -; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] -; GCN-HSA-NEXT: v_bfe_i32 v23, v2, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v19, v10, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v2, v25, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v25, v9, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 -; GCN-HSA-NEXT: v_bfe_i32 v13, v4, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v24, 31, v23 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v25 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; GCN-HSA-NEXT: v_bfe_i32 v11, v14, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[23:26] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[3:4], v[13:16] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v22, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v21, v21, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v20, 31, v19 -; GCN-HSA-NEXT: v_bfe_i32 v17, v22, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v22, 31, v21 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 -; GCN-HSA-NEXT: v_bfe_i32 v15, v8, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[19:22] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 31, v15 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], 
v[15:18] +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v5, v12, 0, 16 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 31, v13 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[5:8] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index 5bc02c4..f879dc6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -6274,12 +6274,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s5, 8 -; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 @@ -6294,19 +6294,19 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s13 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v8, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64: @@ -6325,11 +6325,12 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1 ; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0 ; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 8 -; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 24 -; GCN-HSA-NEXT: s_lshr_b32 s12, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8 ; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31 +; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000 @@ -6337,38 +6338,37 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i64: @@ -6388,10 +6388,10 @@ define 
amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 @@ -6408,18 +6408,18 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v8i8_to_v8i64: @@ -6934,84 +6934,85 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, s5 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s4, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s9, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s9 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 8 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, 
s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s9, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s9, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s35, s5, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31 ; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 
s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64: @@ -7024,41 +7025,41 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2 -; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0 -; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 8 -; GCN-HSA-NEXT: s_mov_b32 s10, s3 -; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s2, 24 -; GCN-HSA-NEXT: s_lshr_b32 s16, s2, 8 +; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 +; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8 +; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8 +; GCN-HSA-NEXT: s_mov_b32 s22, s3 ; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31 ; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], 
s[2:3], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s18, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 8 -; GCN-HSA-NEXT: s_mov_b32 s22, s5 +; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24 +; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16 ; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_ashr_i32 s7, s5, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 -; GCN-HSA-NEXT: s_lshr_b32 s24, s4, 24 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 8 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 -; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8 +; GCN-HSA-NEXT: s_mov_b32 s24, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 @@ -7069,66 +7070,66 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_add_u32 s6, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s15 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v9, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64: @@ -7142,83 +7143,84 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s5, 16 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s5, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s20, s5 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s35, s5, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s9, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s8, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s9, 31 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[34:35], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s19 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s33 -; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s29 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i64: @@ -8174,166 +8176,166 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s39, v3 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s38, v2 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s37, 
v1 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s36, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1 ; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s39, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s42, s39 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s38, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s38, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s37, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s40, s37 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s36, 24 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s36, 8 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s7, 8 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s7 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s39, 31 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s39, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s6, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s6, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s37, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s37, 24 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s41 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s6, 8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s37 -; 
GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s11, 16 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s31 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s11, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s34, s11 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:240 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s29 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s26 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s27 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s10, 24 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s10, 8 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s7, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s7, 24 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s9 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s11, 31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5 -; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s11, 24 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[10:11], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[46:47], s[6:7], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[22:23], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[24:25], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 
s[10:11], s[26:27], 0x80000 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[44:45], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[40:41], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[42:43], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s9 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:160 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s15 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:144 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v7, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s17 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:128 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s19 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s21 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s37 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:80 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s24 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s25 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32 -; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s7 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1) +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v10, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64: @@ -8346,225 +8348,223 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 -; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) -; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v9 -; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v8 -; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v7 -; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v6 -; GCN-HSA-NEXT: s_lshr_b32 s16, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s7, 8 -; GCN-HSA-NEXT: s_mov_b32 s24, s7 -; GCN-HSA-NEXT: s_lshr_b32 s22, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s6, 24 -; GCN-HSA-NEXT: s_lshr_b32 s2, s6, 8 -; GCN-HSA-NEXT: s_lshr_b32 s4, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s9, 8 -; GCN-HSA-NEXT: s_mov_b32 s12, s9 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s6, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s8, 24 -; GCN-HSA-NEXT: s_lshr_b32 s30, s8, 8 -; GCN-HSA-NEXT: s_ashr_i32 s43, s9, 31 -; GCN-HSA-NEXT: s_ashr_i32 s52, s9, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[16:17], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6 +; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24 +; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8 +; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8 +; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16 +; GCN-HSA-NEXT: s_mov_b32 s28, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) -; GCN-HSA-NEXT: v_readfirstlane_b32 s25, v5 -; GCN-HSA-NEXT: v_readfirstlane_b32 s24, v4 -; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-HSA-NEXT: v_readfirstlane_b32 s37, v3 -; GCN-HSA-NEXT: v_readfirstlane_b32 s36, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2 +; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000 +; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0 +; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1 ; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[30:31], 0x80000 -; 
GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[10:11], 0x80000 +; GCN-HSA-NEXT: s_mov_b32 s22, s7 +; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25 +; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16 +; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24 +; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8 +; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8 +; GCN-HSA-NEXT: s_mov_b32 s4, s45 +; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000 +; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24 +; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8 +; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8 +; GCN-HSA-NEXT: s_mov_b32 s14, s41 +; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31 +; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31 +; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24 +; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_lshr_b32 s14, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s37, 8 -; GCN-HSA-NEXT: s_mov_b32 s12, s37 -; GCN-HSA-NEXT: s_lshr_b32 s8, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s36, 24 -; GCN-HSA-NEXT: s_lshr_b32 s4, s36, 8 -; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[36:37], 0x80000 -; GCN-HSA-NEXT: s_lshr_b32 s38, s25, 16 -; GCN-HSA-NEXT: s_lshr_b32 s28, s25, 8 -; GCN-HSA-NEXT: s_mov_b32 s36, s25 -; GCN-HSA-NEXT: s_lshr_b32 s48, s24, 16 -; GCN-HSA-NEXT: s_lshr_b32 s22, s24, 24 -; GCN-HSA-NEXT: s_lshr_b32 s18, s24, 8 -; GCN-HSA-NEXT: s_ashr_i32 s50, s7, 31 -; GCN-HSA-NEXT: s_ashr_i32 s51, s7, 24 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s17 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s15 -; GCN-HSA-NEXT: s_ashr_i32 s33, s37, 31 -; GCN-HSA-NEXT: s_ashr_i32 s42, s37, 24 -; GCN-HSA-NEXT: s_ashr_i32 s53, s25, 31 -; GCN-HSA-NEXT: s_ashr_i32 s54, s25, 24 -; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[24:25], 0x80000 +; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31 +; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24 +; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31 +; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24 +; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 
s[12:13], s[12:13], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[48:49], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 ; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GCN-HSA-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 0x50 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s49 -; GCN-HSA-NEXT: s_add_u32 s48, s0, 64 -; GCN-HSA-NEXT: s_addc_u32 s49, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s44 -; GCN-HSA-NEXT: s_add_u32 s44, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s45 -; GCN-HSA-NEXT: s_addc_u32 s45, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v28, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s50 -; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s49 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s34 -; GCN-HSA-NEXT: s_add_u32 s34, s0, 32 -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s44 -; GCN-HSA-NEXT: s_addc_u32 s35, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s52 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55 +; GCN-HSA-NEXT: s_add_u32 s54, s0, 16 +; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 +; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41 +; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41 +; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55 +; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s21 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xf0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s21 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xe0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[20:23] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s20 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s20, s0, 0xd0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s54 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s37 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s29 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s21, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s20 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24 +; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25 +; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18 +; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15] +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24 +; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v15, s19 +; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 -; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xc0 +; GCN-HSA-NEXT: s_add_u32 s16, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 -; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xb0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15 -; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s15 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -8584,155 +8584,155 @@ 
define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s9, 16 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s9, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s9 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s8, 8 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s9, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s9, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v0 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s11, 16 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v7 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v6 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v4 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s11, 8 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s54, s11 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s69, s11, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s11, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s41 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s38 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s39 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s46 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000 
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s10, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s13, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s55 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s13 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s13, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s13, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s60 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s12, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s12, 8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s15, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s15, 8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s15 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s15, 31 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s15, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s14, 24 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s14, 8 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[62:63], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s63 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s8, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[66:67], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[64:65], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:48 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[68:69], 0x80000 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s42 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s43 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s44 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s45 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: 
global_sextload_v32i8_to_v32i64: @@ -9887,46 +9887,97 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ; ; EG-LABEL: global_zextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1 +; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: AND_INT T0.W, T7.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T7.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W, +; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: AND_INT T4.X, T4.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T8.X, T4.X, ; ; CM-LABEL: global_zextload_v4i8_to_v4i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X +; CM-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T7.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: MOV * T7.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: AND_INT T0.Z, T7.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 255(3.573311e-43), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T0.W, T7.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T0.W, PV.Z, 
PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV T0.Y, T5.X, ; CM-NEXT: MOV * T0.W, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT * T4.Y, T4.X, literal.x, PV.W, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T0.W, T7.X, literal.y, PV.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T7.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT * T4.X, T4.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T8.Y, PV.Z, PV.W, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.Y, +; CM-NEXT: MOV * T8.X, T4.X, %load = load <4 x i8>, ptr addrspace(1) %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, ptr addrspace(1) %out @@ -10017,43 +10068,109 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ; ; EG-LABEL: global_sextload_v4i8_to_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1 +; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T4.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.W, T4.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) -; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: MOV * T7.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T7.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 
16(2.242078e-44) +; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T8.Y, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.Y, +; EG-NEXT: MOV * T8.X, T4.X, ; ; CM-LABEL: global_sextload_v4i8_to_v4i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X +; CM-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T7.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1 +; CM-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T4.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T4.X, literal.x, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: MOV * T7.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T7.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV T0.Y, T5.X, +; CM-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T7.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T8.Y, PV.Z, PV.W, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.Y, +; CM-NEXT: MOV * T8.X, T4.X, %load = load <4 x i8>, ptr addrspace(1) %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, ptr addrspace(1) %out @@ -10158,52 +10275,156 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; ; EG-LABEL: global_zextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: 
MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: AND_INT T0.W, T11.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T11.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T11.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W, -; EG-NEXT: AND_INT * T6.Z, T5.Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T6.X, T5.X, literal.x, -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T12.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T11.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T12.X, T8.X, +; EG-NEXT: MOV * T12.Z, T4.X, ; ; CM-LABEL: global_zextload_v8i8_to_v8i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X +; CM-NEXT: ALU 60, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T11.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; CM-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T5.X, 
KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: MOV * T11.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: AND_INT T0.Z, T11.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 255(3.573311e-43), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T0.W, T11.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV T0.Y, T9.X, ; CM-NEXT: MOV * T0.W, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT * T6.W, T5.Y, literal.x, PV.W, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T11.X, literal.y, PV.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T11.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W, -; CM-NEXT: AND_INT * T6.Z, T5.Y, literal.y, -; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; CM-NEXT: AND_INT * T6.X, T5.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T12.Y, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T11.Y, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T11.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T0.W, T11.Y, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T11.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T12.W, PV.Z, PV.W, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T12.X, T8.X, +; CM-NEXT: MOV * T12.Z, T4.X, BS:VEC_120/SCL_212 %load = load <8 x i8>, ptr addrspace(1) %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, ptr addrspace(1) %out @@ -10309,33 +10530,34 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 ; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s5, 16 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s5 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s4 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s9, s5, 0x80000 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s4, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s7, s7, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s6, s6, 0x80000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s9, 0xffff, s9 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s7, 0xffff, s7 ; GCN-NOHSA-VI-NEXT: s_and_b32 s6, 0xffff, s6 -; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s10, s11 ; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s7, s5 ; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s6, s4 -; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, s10, v0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 @@ -10344,53 +10566,183 @@ define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ; ; EG-LABEL: global_sextload_v8i8_to_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1 +; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T5.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: MOV * T11.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.Y, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T5.X, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 
8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T11.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T12.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT * T6.Y, PS, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T11.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T12.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T12.X, T8.X, +; EG-NEXT: MOV * T12.Z, T4.X, ; ; CM-LABEL: global_sextload_v8i8_to_v8i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T5.X +; CM-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T11.X ; CM-NEXT: CF_END ; CM-NEXT: PAD ; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1 +; CM-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1 ; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T5.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: BFE_INT * T6.Z, T5.Y, 0.0, literal.x, +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: MOV * T11.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T11.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T5.Y, 
literal.x, +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T0.Z, T5.X, literal.x, -; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x, +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV T0.Y, T9.X, +; CM-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T6.Y, PV.Z, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T11.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T12.Y, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV T0.Y, T4.X, +; CM-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T11.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV T0.Y, T5.X, +; CM-NEXT: LSHR * T0.W, T11.Y, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T11.Y, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T12.W, PV.Z, PV.W, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T12.X, T8.X, +; CM-NEXT: MOV * T12.Z, T4.X, BS:VEC_120/SCL_212 %load = load <8 x i8>, ptr addrspace(1) %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, ptr addrspace(1) %out @@ -10547,71 +10899,287 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out ; ; EG-LABEL: global_zextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 103, @12, KC0[], 
KC1[] +; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT T0.W, T19.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T19.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T8.W, T7.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W, -; EG-NEXT: AND_INT T8.Z, T7.Y, literal.y, -; EG-NEXT: BFE_UINT * T9.W, T7.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T8.X, T7.X, literal.x, -; EG-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W, -; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT * T9.Z, T7.W, literal.x, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T9.X, T7.Z, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44) -; EG-NEXT: LSHR * T10.X, PV.W, literal.x, +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T20.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T12.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; 
EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T20.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T19.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T19.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T19.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 116: +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR T0.W, T19.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44) +; EG-NEXT: LSHR T21.X, PS, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00) +; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T19.W, PV.W, PS, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T20.X, T16.X, +; EG-NEXT: MOV * T20.Z, T12.X, +; EG-NEXT: MOV T19.X, T8.X, +; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 ; ; CM-LABEL: global_zextload_v16i8_to_v16i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T7.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X +; CM-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 0 @8 +; CM-NEXT: ALU 101, @12, KC0[], KC1[] +; CM-NEXT: ALU 20, @114, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T22.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T21.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; CM-NEXT: ALU clause 
starting at 8: -; CM-NEXT: MOV * T7.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 8: +; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: MOV * T0.Y, T16.X, +; CM-NEXT: MOV * T19.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 12: +; CM-NEXT: AND_INT T0.Z, T19.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 255(3.573311e-43), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T16.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T0.W, T19.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T16.X, PV.W, +; CM-NEXT: MOV T0.Y, T17.X, ; CM-NEXT: MOV * T0.W, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT * T8.W, T7.W, literal.x, PV.W, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T19.X, literal.y, PV.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T17.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T19.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT T8.Y, T7.Z, literal.x, T0.W, -; CM-NEXT: AND_INT T8.Z, T7.W, literal.y, -; CM-NEXT: BFE_UINT * T9.W, T7.Y, literal.x, T0.W, -; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; CM-NEXT: AND_INT T8.X, T7.Z, literal.x, -; CM-NEXT: BFE_UINT T9.Y, T7.X, literal.y, T0.W, -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T20.Y, PV.Z, PV.W, +; CM-NEXT: MOV T17.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T12.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T19.Y, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T12.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T19.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T12.X, PV.W, +; CM-NEXT: MOV * T0.Y, T13.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T19.Y, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T13.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T19.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T20.W, PV.Z, PV.W, +; CM-NEXT: MOV T13.X, PV.W, +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T19.Z, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T19.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 
16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV * T0.Y, T9.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T19.Z, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T19.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T19.Y, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T19.W, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T19.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: AND_INT * T0.Z, PV.Y, literal.x, +; CM-NEXT: -65536(nan), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 114: +; CM-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T10.X, PV.W, literal.x, -; CM-NEXT: AND_INT * T9.Z, T7.Y, literal.y, -; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) -; CM-NEXT: AND_INT * T9.X, T7.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T0.W, T0.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T19.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Y, PV.Y, literal.y, +; CM-NEXT: AND_INT T0.Z, PV.W, literal.z, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w, +; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; CM-NEXT: 16711680(2.341805e-38), 16(2.242078e-44) +; CM-NEXT: LSHR T22.X, PV.W, literal.x, +; CM-NEXT: OR_INT * T19.W, PV.Y, PV.Z, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T20.X, T16.X, +; CM-NEXT: MOV * T20.Z, T12.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T19.X, T8.X, +; CM-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 %load = load <16 x i8>, ptr addrspace(1) %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, ptr addrspace(1) %out @@ -10777,27 +11345,27 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s7, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s4 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s6 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 
s15, s6, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s5, 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s4, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v2 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s5, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s5 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s5, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s7 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s6 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s4 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s4, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s7, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s6, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s11, s11, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s10, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 8 @@ -10806,12 +11374,12 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s8, s8, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s11, 0xffff, s11 ; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10 ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff0000 @@ -10822,94 +11390,365 @@ define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s11, s7 -; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s10, s6 +; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s5 +; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s10, s4 ; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s16 ; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s18 -; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s9, s5 -; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s8, s4 +; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s9, s7 +; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s8, s6 ; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12 ; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s15, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, 
s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v16i8_to_v16i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1 +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @8 +; EG-NEXT: ALU 104, @12, KC0[], KC1[] +; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x, +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T19.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T7.Y, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T7.W, literal.x, -; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x, -; EG-NEXT: LSHR * T0.W, T7.X, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y, -; EG-NEXT: LSHR T1.Z, T7.Z, literal.y, -; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, 
PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T10.X, PS, literal.x, -; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: LSHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T20.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T19.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: ALU clause starting at 117: +; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV 
T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T19.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR T0.W, T19.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44) +; EG-NEXT: LSHR T21.X, PS, literal.x, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.y, +; EG-NEXT: LSHL * T0.W, PV.W, literal.z, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T19.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T20.X, T16.X, +; EG-NEXT: MOV * T20.Z, T12.X, +; EG-NEXT: MOV T19.X, T8.X, +; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 ; ; CM-LABEL: global_sextload_v16i8_to_v16i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 0 @6 -; CM-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T7.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X +; CM-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 0 @8 +; CM-NEXT: ALU 104, @12, KC0[], KC1[] +; CM-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T22.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T21.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 6: -; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 -; CM-NEXT: ALU clause starting at 8: -; CM-NEXT: MOV * T7.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 9: -; CM-NEXT: BFE_INT * T8.Z, T7.W, 0.0, literal.x, +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 8: +; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1 +; CM-NEXT: ALU clause starting at 10: +; CM-NEXT: MOV * T0.Y, T16.X, +; CM-NEXT: MOV * T19.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 12: +; CM-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T16.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T19.X, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T8.X, T7.Z, 0.0, literal.x, -; CM-NEXT: LSHR T0.Y, T7.Y, literal.x, -; CM-NEXT: BFE_INT T9.Z, T7.Y, 0.0, literal.x, -; CM-NEXT: LSHR * T0.W, T7.W, literal.x, +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x, -; CM-NEXT: LSHR T1.Y, T7.Z, literal.x, -; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y, -; CM-NEXT: BFE_INT * T8.W, PV.W, 0.0, literal.x, -; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; CM-NEXT: LSHR T10.X, PV.Z, literal.x, -; CM-NEXT: BFE_INT T8.Y, PV.Y, 0.0, literal.y, -; CM-NEXT: LSHR T0.Z, T7.X, literal.y, -; CM-NEXT: BFE_INT * T9.W, T0.Y, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: 
OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T16.X, PV.W, +; CM-NEXT: MOV T0.Y, T17.X, +; CM-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T17.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T19.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T20.Y, PV.Z, PV.W, +; CM-NEXT: MOV T17.X, PV.Y, +; CM-NEXT: MOV T0.Y, T12.X, +; CM-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T12.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T19.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T12.X, PV.W, +; CM-NEXT: MOV T0.Y, T13.X, +; CM-NEXT: LSHR * T0.W, T19.Y, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T13.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T19.Y, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T20.W, PV.Z, PV.W, +; CM-NEXT: MOV T13.X, PV.W, +; CM-NEXT: MOV T0.Y, T8.X, +; CM-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T19.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV T0.Y, T9.X, +; CM-NEXT: LSHR * T0.W, T19.Z, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T19.Z, literal.x, +; CM-NEXT: 24(3.363116e-44), 
0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: ALU clause starting at 117: +; CM-NEXT: OR_INT * T19.Y, T0.Z, T0.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV T0.Y, T4.X, +; CM-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T19.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV T0.Y, T5.X, +; CM-NEXT: LSHR * T0.W, T19.W, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T19.W, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Y, PV.Y, literal.y, +; CM-NEXT: LSHL T0.Z, PV.W, literal.z, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, +; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T22.X, PV.W, literal.x, +; CM-NEXT: OR_INT * T19.W, PV.Y, PV.Z, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T20.X, T16.X, +; CM-NEXT: MOV * T20.Z, T12.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T19.X, T8.X, +; CM-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212 %load = load <16 x i8>, ptr addrspace(1) %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, ptr addrspace(1) %out @@ -11181,115 +12020,543 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out ; ; EG-LABEL: global_zextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 37, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1 +; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @10 +; EG-NEXT: ALU 103, @16, KC0[], KC1[] +; EG-NEXT: ALU 104, @120, KC0[], KC1[] +; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: +; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 
+; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: AND_INT T0.W, T37.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 255(3.573311e-43), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T0.W, T37.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, ; EG-NEXT: MOV * T0.W, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT * T13.W, T11.Y, literal.x, PV.W, +; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W, +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T12.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W, -; EG-NEXT: AND_INT T13.Z, T11.Y, literal.y, -; EG-NEXT: BFE_UINT * T14.W, T11.W, literal.x, T0.W, -; EG-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; EG-NEXT: AND_INT T13.X, T11.X, literal.x, -; EG-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W, -; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T14.Z, T11.W, literal.x, -; EG-NEXT: BFE_UINT * T15.W, T12.Y, literal.y, T0.W, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; EG-NEXT: AND_INT T14.X, T11.Z, literal.x, -; EG-NEXT: BFE_UINT T15.Y, T12.X, literal.y, T0.W, -; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, T13.X, +; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T16.X, PV.W, literal.x, -; EG-NEXT: AND_INT T15.Z, T12.Y, literal.y, -; EG-NEXT: BFE_UINT T17.W, T12.W, literal.z, T0.W, -; EG-NEXT: AND_INT * T15.X, T12.X, literal.y, -; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.Y, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_UINT T17.Y, T12.Z, literal.x, T0.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T12.X, PV.W, 
literal.x, -; EG-NEXT: AND_INT T17.Z, T12.W, literal.y, -; EG-NEXT: AND_INT * T17.X, T12.Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV * T0.Y, T8.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T37.Y, PV.W, PS, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T37.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 120: +; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T32.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.X, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T32.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T32.X, PV.W, +; EG-NEXT: MOV T0.Y, T33.X, +; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, 
+; EG-NEXT: MOV * T33.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T38.Y, PV.W, PS, +; EG-NEXT: MOV T33.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T28.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T28.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T28.X, PV.W, +; EG-NEXT: MOV T0.Y, T29.X, +; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T29.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T38.W, PV.W, PS, +; EG-NEXT: MOV T29.X, PV.W, +; EG-NEXT: MOV * T0.Y, T24.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T24.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHL * T1.W, T35.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T24.X, PV.W, +; EG-NEXT: MOV T0.Y, T25.X, +; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, PV.W, T1.W, +; EG-NEXT: MOV * T25.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T1.W, T35.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T35.Y, PV.W, PS, +; EG-NEXT: MOV T25.X, PV.Y, +; EG-NEXT: MOV * T0.Y, T20.X, +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T2.W, T35.W, literal.y, +; EG-NEXT: -65536(nan), 255(3.573311e-43) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV * T20.X, PV.W, +; EG-NEXT: ALU clause starting at 225: +; EG-NEXT: MOV T0.Y, T20.X, +; EG-NEXT: LSHL * T1.W, T35.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; EG-NEXT: OR_INT * T1.W, PV.W, PS, +; EG-NEXT: MOV T20.X, PV.W, +; EG-NEXT: MOV T0.Y, T21.X, +; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, T0.W, +; 
EG-NEXT: MOV * T21.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR * T18.X, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T0.W, T35.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44) +; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; EG-NEXT: AND_INT T0.W, PV.W, literal.z, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44) +; EG-NEXT: LSHR T42.X, PS, literal.x, +; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T21.X, PV.W, +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T38.X, T32.X, +; EG-NEXT: MOV * T38.Z, T28.X, +; EG-NEXT: MOV T35.X, T24.X, +; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 ; ; CM-LABEL: global_zextload_v32i8_to_v32i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 1 @8 -; CM-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T12.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T18.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T16.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T15.X +; CM-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 1 @10 +; CM-NEXT: ALU 101, @16, KC0[], KC1[] +; CM-NEXT: ALU 101, @118, KC0[], KC1[] +; CM-NEXT: ALU 40, @220, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T42.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T40.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T39.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 8: -; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1 -; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1 -; CM-NEXT: ALU clause starting at 12: -; CM-NEXT: MOV * T11.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 13: +; CM-NEXT: Fetch clause starting at 10: +; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1 +; CM-NEXT: ALU clause starting at 14: +; CM-NEXT: MOV * T0.Y, T16.X, +; CM-NEXT: MOV * T35.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 16: +; CM-NEXT: AND_INT T0.Z, T37.X, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 255(3.573311e-43), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T16.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T0.W, T37.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T16.X, PV.W, +; CM-NEXT: MOV T0.Y, T17.X, ; CM-NEXT: MOV * T0.W, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT * T13.W, T11.W, literal.x, PV.W, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T37.X, literal.y, PV.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T17.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T37.X, literal.x, ; CM-NEXT: 
8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_UINT T13.Y, T11.Z, literal.x, T0.W, -; CM-NEXT: AND_INT T13.Z, T11.W, literal.y, -; CM-NEXT: BFE_UINT * T14.W, T11.Y, literal.x, T0.W, -; CM-NEXT: 8(1.121039e-44), 255(3.573311e-43) -; CM-NEXT: AND_INT T13.X, T11.Z, literal.x, -; CM-NEXT: BFE_UINT T14.Y, T11.X, literal.y, T0.W, -; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T15.X, PV.W, literal.x, -; CM-NEXT: AND_INT T14.Z, T11.Y, literal.y, -; CM-NEXT: BFE_UINT * T11.W, T12.W, literal.z, T0.W, -; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T35.Y, PV.Z, PV.W, +; CM-NEXT: MOV T17.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T12.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T37.Y, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T12.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T37.Y, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T14.X, T11.X, literal.x, -; CM-NEXT: BFE_UINT T11.Y, T12.Z, literal.y, T0.W, -; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z, -; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44) -; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T16.X, PV.W, literal.x, -; CM-NEXT: AND_INT T11.Z, T12.W, literal.y, -; CM-NEXT: BFE_UINT * T17.W, T12.Y, literal.z, T0.W, -; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T12.X, PV.W, +; CM-NEXT: MOV * T0.Y, T13.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T37.Y, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T13.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T37.Y, literal.x, ; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: AND_INT T11.X, T12.Z, literal.x, -; CM-NEXT: BFE_UINT T17.Y, T12.X, literal.y, T0.W, -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; CM-NEXT: MOV T13.X, PV.W, +; CM-NEXT: MOV * T0.Y, T8.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T37.Z, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T37.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV * T0.Y, T9.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T37.Z, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T37.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, 
PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T37.Y, PV.Z, PV.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T4.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T37.W, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T37.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV * T0.Y, T5.X, +; CM-NEXT: AND_INT * T0.Z, PV.Y, literal.x, +; CM-NEXT: -65536(nan), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 118: +; CM-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W, ; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T18.X, PV.W, literal.x, -; CM-NEXT: AND_INT * T17.Z, T12.Y, literal.y, -; CM-NEXT: 2(2.802597e-45), 255(3.573311e-43) -; CM-NEXT: AND_INT * T17.X, T12.X, literal.x, -; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; CM-NEXT: LSHR * T12.X, KC0[2].Y, literal.x, +; CM-NEXT: OR_INT * T1.W, T0.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T37.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T37.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV * T0.Y, T32.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T36.X, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T32.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T36.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T32.X, PV.W, +; CM-NEXT: MOV * T0.Y, T33.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T36.X, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T33.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T36.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T38.Y, PV.Z, PV.W, +; CM-NEXT: MOV T33.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T28.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T36.Y, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T28.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T36.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T28.X, PV.W, +; CM-NEXT: MOV * T0.Y, T29.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T36.Y, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * 
T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T29.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T36.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T38.W, PV.Z, PV.W, +; CM-NEXT: MOV T29.X, PV.W, +; CM-NEXT: MOV * T0.Y, T24.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T36.Z, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T24.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHL * T1.W, T36.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T24.X, PV.W, +; CM-NEXT: MOV * T0.Y, T25.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T1.W, T36.Z, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T25.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T1.W, T36.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T36.Y, PV.Z, PV.W, +; CM-NEXT: MOV T25.X, PV.Y, +; CM-NEXT: MOV * T0.Y, T20.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, T36.W, literal.y, +; CM-NEXT: -65536(nan), 255(3.573311e-43) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV * T20.X, PV.W, +; CM-NEXT: ALU clause starting at 220: +; CM-NEXT: MOV T0.Y, T20.X, +; CM-NEXT: LSHL * T1.W, T36.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T1.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38) +; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W, +; CM-NEXT: MOV T20.X, PV.W, +; CM-NEXT: MOV * T0.Y, T21.X, +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: BFE_UINT * T0.W, T36.W, literal.y, T0.W, +; CM-NEXT: -65536(nan), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T21.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T39.X, PV.W, literal.x, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44) +; CM-NEXT: LSHR T40.X, PV.W, literal.x, +; CM-NEXT: LSHR * T0.W, T36.W, literal.y, +; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; CM-NEXT: LSHR T41.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Y, T0.Y, literal.y, +; CM-NEXT: AND_INT T0.Z, PV.W, literal.z, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w, +; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; CM-NEXT: 16711680(2.341805e-38), 16(2.242078e-44) +; CM-NEXT: LSHR T42.X, PV.W, literal.x, +; CM-NEXT: OR_INT * T36.W, PV.Y, PV.Z, ; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T21.X, PV.W, +; CM-NEXT: MOV T35.X, T16.X, +; CM-NEXT: MOV * T35.Z, T12.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T37.X, T8.X, +; CM-NEXT: MOV * T37.Z, T4.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T38.X, T32.X, +; CM-NEXT: MOV * T38.Z, T28.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T36.X, T24.X, +; CM-NEXT: MOV * T36.Z, T20.X, BS:VEC_120/SCL_212 %load = load <32 x i8>, ptr addrspace(1) %in %ext = zext 
<32 x i8> %load to <32 x i16> store <32 x i16> %ext, ptr addrspace(1) %out @@ -11577,8 +12844,8 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16 -; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1) @@ -11586,38 +12853,39 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s6 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s31, s6, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s14, s14, 0x80000 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s15, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, 0xffff, s14 ; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0) -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 ; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s14, s6 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s5 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s11 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s10 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s9 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s8 ; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s15, s7 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s5, 0x80000 ; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s11, 0x80000 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s10, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s9, 0x80000 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s8, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s20, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s22, s22, 8 ; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6 -; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v6 +; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v7 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s21, 0xffff, s21 @@ -11627,11 +12895,11 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s15, s14 ; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s4 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s8, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, 
s9 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s9, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s11 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s11, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16 ; GCN-NOHSA-VI-NEXT: s_or_b32 s20, s21, s20 ; GCN-NOHSA-VI-NEXT: s_or_b32 s21, s23, s22 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8 @@ -11640,13 +12908,12 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s13, 0x80000 ; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s12, s12, 0x80000 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s8 -; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s8, 0x80000 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s9, 16 +; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s10 +; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s10, 0x80000 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s17, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s16, s16, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s28, s28, 8 @@ -11657,16 +12924,16 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 0xffff, s13 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s12, 0xffff, s12 -; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 24 +; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s19, 0x80000 ; GCN-NOHSA-VI-NEXT: s_bfe_i32 s18, s18, 0x80000 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 8 ; GCN-NOHSA-VI-NEXT: s_lshl_b32 s26, s26, 8 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s29, 0xffff, s29 ; GCN-NOHSA-VI-NEXT: s_and_b32 s31, 0xffff, s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff0000 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17 ; GCN-NOHSA-VI-NEXT: s_and_b32 s16, 0xffff, s16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff0000 @@ -11674,161 +12941,702 @@ define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out ; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s15 ; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s13, s5 ; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s12, s4 -; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16 ; GCN-NOHSA-VI-NEXT: s_and_b32 s25, 0xffff, s25 ; GCN-NOHSA-VI-NEXT: s_and_b32 s27, 0xffff, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19 ; GCN-NOHSA-VI-NEXT: s_and_b32 s18, 0xffff, s18 ; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff0000 ; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff0000 -; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s17, s9 -; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s16, s8 +; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s17, s11 +; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s16, s10 ; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s29, s28 ; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s31, s30 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 ; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v1, s4 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s11 -; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s18, s10 +; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s19, s9 +; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s18, s8 ; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s25, s24 ; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s27, s26 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: global_sextload_v32i8_to_v32i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @8 -; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1 +; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @10 +; EG-NEXT: ALU 104, @16, KC0[], KC1[] +; EG-NEXT: ALU 104, @121, KC0[], KC1[] +; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1 ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: MOV * T11.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 13: -; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) -; EG-NEXT: LSHR T14.X, PV.W, literal.x, -; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x, -; EG-NEXT: LSHR T0.Y, T12.W, literal.x, -; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: LSHR T0.W, T12.Y, literal.x, -; EG-NEXT: LSHR * T1.W, T11.Y, literal.x, +; EG-NEXT: Fetch clause starting at 10: +; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; EG-NEXT: ALU clause 
starting at 14: +; EG-NEXT: MOV * T0.Y, T16.X, +; EG-NEXT: MOV * T35.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 16: +; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x, -; EG-NEXT: LSHR T1.Y, T11.W, literal.x, -; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x, -; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T11.X, literal.x, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T16.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.X, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x, -; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x, -; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x, -; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T11.Z, literal.x, +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T16.X, PV.W, +; EG-NEXT: MOV T0.Y, T17.X, +; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T17.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T36.Y, PV.W, PS, +; EG-NEXT: MOV T17.X, PV.Y, +; EG-NEXT: MOV T0.Y, T12.X, +; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x, ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x, -; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x, -; EG-NEXT: LSHR T0.Z, T12.X, literal.x, -; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44) -; EG-NEXT: LSHR T11.X, PS, literal.x, -; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: LSHR T0.Z, T12.Z, literal.y, -; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T12.X, PS, literal.x, -; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y, -; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T12.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T12.X, PV.W, +; EG-NEXT: MOV T0.Y, 
T13.X, +; EG-NEXT: LSHR * T0.W, T37.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T13.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T36.W, PV.W, PS, +; EG-NEXT: MOV T13.X, PV.W, +; EG-NEXT: MOV T0.Y, T8.X, +; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T8.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T8.X, PV.W, +; EG-NEXT: MOV T0.Y, T9.X, +; EG-NEXT: LSHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T9.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: ALU clause starting at 121: +; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W, +; EG-NEXT: MOV T9.X, PV.Y, +; EG-NEXT: MOV T0.Y, T4.X, +; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T37.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T4.X, PV.W, +; EG-NEXT: MOV T0.Y, T5.X, +; EG-NEXT: LSHR * T0.W, T37.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T37.W, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * 
T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T37.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV T0.Y, T32.X, +; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T32.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.X, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T32.X, PV.W, +; EG-NEXT: MOV T0.Y, T33.X, +; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T33.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T38.Y, PV.W, PS, +; EG-NEXT: MOV T33.X, PV.Y, +; EG-NEXT: MOV T0.Y, T28.X, +; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T28.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T28.X, PV.W, +; EG-NEXT: MOV T0.Y, T29.X, +; EG-NEXT: LSHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T29.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.Y, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: ALU clause starting at 226: +; EG-NEXT: AND_INT T1.W, T0.Y, literal.x, +; EG-NEXT: LSHL * T0.W, T0.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T38.W, PV.W, PS, +; EG-NEXT: MOV T29.X, PV.W, +; EG-NEXT: MOV T0.Y, T24.X, +; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T24.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.Z, 
literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T24.X, PV.W, +; EG-NEXT: MOV T0.Y, T25.X, +; EG-NEXT: LSHR * T0.W, T35.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T25.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ASHR * T0.W, T35.Z, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: LSHL * T0.W, PV.W, literal.y, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: OR_INT * T35.Y, PV.W, PS, +; EG-NEXT: MOV T25.X, PV.Y, +; EG-NEXT: MOV T0.Y, T20.X, +; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PV.Y, literal.x, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.y, +; EG-NEXT: -65536(nan), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV * T20.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T20.X, PV.W, +; EG-NEXT: MOV T0.Y, T21.X, +; EG-NEXT: LSHR * T0.W, T35.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y, +; EG-NEXT: 8(1.121039e-44), -65536(nan) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T21.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T39.X, PV.W, literal.x, +; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ASHR T0.W, T35.W, literal.x, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y, +; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44) +; EG-NEXT: LSHR T41.X, PS, literal.x, +; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y, +; EG-NEXT: LSHL T0.W, PV.W, literal.z, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w, +; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44) +; EG-NEXT: LSHR T42.X, PS, literal.x, +; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T21.X, PV.W, +; EG-NEXT: MOV * T36.X, T16.X, +; EG-NEXT: MOV * T36.Z, T12.X, +; EG-NEXT: MOV T37.X, T8.X, +; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212 +; EG-NEXT: MOV * T38.X, T32.X, +; EG-NEXT: MOV * T38.Z, T28.X, +; EG-NEXT: MOV T35.X, T24.X, +; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 ; ; CM-LABEL: global_sextload_v32i8_to_v32i16: ; CM: ; %bb.0: -; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[] -; CM-NEXT: TEX 1 @8 -; CM-NEXT: ALU 40, @13, KC0[CB0:0-32], KC1[] -; CM-NEXT: MEM_RAT_CACHELESS 
STORE_DWORD T17, T11.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T18.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T14.X -; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T13.X +; CM-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[] +; CM-NEXT: TEX 1 @10 +; CM-NEXT: ALU 104, @16, KC0[], KC1[] +; CM-NEXT: ALU 104, @121, KC0[], KC1[] +; CM-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T42.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T40.X +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T39.X ; CM-NEXT: CF_END -; CM-NEXT: Fetch clause starting at 8: -; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1 -; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1 -; CM-NEXT: ALU clause starting at 12: -; CM-NEXT: MOV * T11.X, KC0[2].Z, -; CM-NEXT: ALU clause starting at 13: +; CM-NEXT: Fetch clause starting at 10: +; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1 +; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1 +; CM-NEXT: ALU clause starting at 14: +; CM-NEXT: MOV * T0.Y, T16.X, +; CM-NEXT: MOV * T35.X, KC0[2].Z, +; CM-NEXT: ALU clause starting at 16: +; CM-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.W, literal.x, +; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y, +; CM-NEXT: 65535(9.183409e-41), -65536(nan) +; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z, +; CM-NEXT: MOV * T16.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T37.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T16.X, PV.W, +; CM-NEXT: MOV T0.Y, T17.X, +; CM-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T17.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T37.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T36.Y, PV.Z, PV.W, +; CM-NEXT: MOV T17.X, PV.Y, +; CM-NEXT: MOV T0.Y, T12.X, +; CM-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T12.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T37.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T12.X, PV.W, +; CM-NEXT: MOV T0.Y, T13.X, +; CM-NEXT: LSHR * T0.W, T37.Y, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 
0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T13.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T37.Y, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T36.W, PV.Z, PV.W, +; CM-NEXT: MOV T13.X, PV.W, +; CM-NEXT: MOV T0.Y, T8.X, +; CM-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T8.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T37.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T8.X, PV.W, +; CM-NEXT: MOV T0.Y, T9.X, +; CM-NEXT: LSHR * T0.W, T37.Z, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T9.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T37.Z, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: ALU clause starting at 121: +; CM-NEXT: OR_INT * T37.Y, T0.Z, T0.W, +; CM-NEXT: MOV T9.X, PV.Y, +; CM-NEXT: MOV T0.Y, T4.X, +; CM-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T4.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T37.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T4.X, PV.W, +; CM-NEXT: MOV T0.Y, T5.X, +; CM-NEXT: LSHR * T0.W, T37.W, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T5.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T37.W, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T37.W, PV.Z, PV.W, +; CM-NEXT: MOV T5.X, PV.W, +; CM-NEXT: MOV T0.Y, T32.X, 
+; CM-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T32.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T35.X, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T32.X, PV.W, +; CM-NEXT: MOV T0.Y, T33.X, +; CM-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212 +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T33.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T35.X, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T38.Y, PV.Z, PV.W, +; CM-NEXT: MOV T33.X, PV.Y, +; CM-NEXT: MOV T0.Y, T28.X, +; CM-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T28.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T35.Y, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T28.X, PV.W, +; CM-NEXT: MOV T0.Y, T29.X, +; CM-NEXT: LSHR * T0.W, T35.Y, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T29.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T35.Y, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: ALU clause starting at 226: +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, T0.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T38.W, PV.Z, PV.W, +; CM-NEXT: MOV T29.X, PV.W, +; CM-NEXT: MOV T0.Y, T24.X, +; CM-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T24.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T35.Z, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; 
CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T24.X, PV.W, +; CM-NEXT: MOV T0.Y, T25.X, +; CM-NEXT: LSHR * T0.W, T35.Z, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T25.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: ASHR * T0.W, T35.Z, literal.x, +; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T35.Y, PV.Z, PV.W, +; CM-NEXT: MOV T25.X, PV.Y, +; CM-NEXT: MOV T0.Y, T20.X, +; CM-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T20.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, +; CM-NEXT: LSHR * T0.W, T35.W, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: LSHL * T0.W, PV.W, literal.y, +; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV T20.X, PV.W, +; CM-NEXT: MOV T0.Y, T21.X, +; CM-NEXT: LSHR * T0.W, T35.W, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x, +; CM-NEXT: AND_INT * T0.W, PV.W, literal.y, +; CM-NEXT: -65536(nan), 65535(9.183409e-41) +; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W, +; CM-NEXT: MOV * T21.X, PV.W, +; CM-NEXT: MOV T0.Y, PV.X, ; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T13.X, PV.W, literal.x, -; CM-NEXT: LSHR T0.Y, T11.Y, literal.y, -; CM-NEXT: LSHR T0.Z, T11.Z, literal.y, -; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) ; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) -; CM-NEXT: LSHR T14.X, PV.W, literal.x, -; CM-NEXT: LSHR T1.Y, T11.W, literal.y, -; CM-NEXT: BFE_INT T15.Z, T12.W, 0.0, literal.y, BS:VEC_120/SCL_212 -; CM-NEXT: LSHR * T0.W, T12.X, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; CM-NEXT: BFE_INT T15.X, T12.Z, 0.0, literal.x, -; CM-NEXT: LSHR T2.Y, T12.Y, literal.x, -; CM-NEXT: BFE_INT T16.Z, T12.Y, 0.0, literal.x, -; CM-NEXT: LSHR * T1.W, T12.W, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T16.X, T12.X, 0.0, literal.x, -; CM-NEXT: LSHR T3.Y, T12.Z, literal.x, -; CM-NEXT: BFE_INT T12.Z, T11.W, 0.0, literal.x, -; CM-NEXT: BFE_INT * T15.W, PV.W, 0.0, literal.x, -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T12.X, T11.Z, 0.0, literal.x, -; CM-NEXT: BFE_INT T15.Y, PV.Y, 0.0, literal.x, -; CM-NEXT: BFE_INT T17.Z, T11.Y, 0.0, literal.x, -; CM-NEXT: BFE_INT * T16.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212 -; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) -; CM-NEXT: BFE_INT T17.X, T11.X, 0.0, literal.x, -; CM-NEXT: BFE_INT T16.Y, T0.W, 0.0, literal.x, -; 
CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y, -; CM-NEXT: BFE_INT * T12.W, T1.Y, 0.0, literal.x, -; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44) -; CM-NEXT: LSHR T18.X, PV.Z, literal.x, -; CM-NEXT: BFE_INT T12.Y, T0.Z, 0.0, literal.y, -; CM-NEXT: LSHR T0.Z, T11.X, literal.y, -; CM-NEXT: BFE_INT * T17.W, T0.Y, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) -; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x, -; CM-NEXT: BFE_INT * T17.Y, PV.Z, 0.0, literal.y, -; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44) +; CM-NEXT: LSHR T39.X, PV.W, literal.x, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44) +; CM-NEXT: LSHR T40.X, PV.W, literal.x, +; CM-NEXT: ASHR * T0.W, T35.W, literal.y, +; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44) +; CM-NEXT: LSHR T41.X, KC0[2].Y, literal.x, +; CM-NEXT: AND_INT T0.Y, T0.Y, literal.y, +; CM-NEXT: LSHL T0.Z, PV.W, literal.z, +; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z, +; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41) +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR T42.X, PV.W, literal.x, +; CM-NEXT: OR_INT * T35.W, PV.Y, PV.Z, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T21.X, PV.W, +; CM-NEXT: MOV T36.X, T16.X, +; CM-NEXT: MOV * T36.Z, T12.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T37.X, T8.X, +; CM-NEXT: MOV * T37.Z, T4.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T38.X, T32.X, +; CM-NEXT: MOV * T38.Z, T28.X, BS:VEC_120/SCL_212 +; CM-NEXT: MOV T35.X, T24.X, +; CM-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212 %load = load <32 x i8>, ptr addrspace(1) %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index 8dcecfe..ddd1ce66 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -151,19 +151,27 @@ define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace ; ; EG-LABEL: local_load_v3i16: ; EG: ; %bb.0: ; %entry -; EG-NEXT: ALU 11, @2, KC0[CB0:0-32], KC1[] -; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W -; EG-NEXT: MOV T0.X, OQAP, +; EG-NEXT: ALU 19, @2, KC0[CB0:0-32], KC1[] ; EG-NEXT: MOV * T0.W, KC0[2].Z, ; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W ; EG-NEXT: MOV T0.Y, OQAP, -; EG-NEXT: MOV * T0.W, KC0[2].Y, -; EG-NEXT: LDS_WRITE * T0.W, T0.Y, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W +; EG-NEXT: MOV * T0.Z, OQAP, +; EG-NEXT: LSHL T0.Z, PV.Z, literal.x, +; EG-NEXT: AND_INT T0.W, T0.Y, literal.y, +; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.z, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T1.W +; EG-NEXT: MOV T0.Y, OQAP, +; EG-NEXT: OR_INT T0.W, T0.Z, T0.W, +; EG-NEXT: MOV * T1.W, KC0[2].Y, +; EG-NEXT: LDS_WRITE * T1.W, T0.W, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X, +; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.Y, ; EG-NEXT: RETURN entry: %ld = load <3 x i16>, ptr addrspace(3) %in @@ -1075,12 +1083,12 @@ define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ; SI-NEXT: s_mov_b32 m0, -1 ; SI-NEXT: ds_read_b64 v[0:1], v0 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_ashrrev_i32_e32 v2, 16, v1 -; SI-NEXT: 
v_ashrrev_i32_e32 v4, 16, v0 -; SI-NEXT: v_bfe_i32 v1, v1, 0, 16 -; SI-NEXT: v_bfe_i32 v3, v0, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0 +; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v1 +; SI-NEXT: v_bfe_i32 v2, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v4, v1, 0, 16 ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset1:1 +; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; SI-NEXT: s_endpgm ; ; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32: @@ -6145,11 +6153,11 @@ define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ; SI-NEXT: v_bfe_i32 v8, v1, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v10, v9, 0, 16 -; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 -; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: v_bfe_i32 v12, v12, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; SI-NEXT: v_bfe_i32 v14, v11, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 @@ -6811,10 +6819,10 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_mov_b32_e32 v18, s0 ; SI-NEXT: s_waitcnt lgkmcnt(1) ; SI-NEXT: v_mov_b32_e32 v12, v3 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v7 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, v7 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5 @@ -6837,24 +6845,24 @@ define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v3, v5, 0, 16 ; SI-NEXT: v_bfe_i32 v5, v6, 0, 16 -; SI-NEXT: v_bfe_i32 v7, v0, 0, 16 -; SI-NEXT: v_bfe_i32 v10, v2, 0, 16 +; SI-NEXT: v_bfe_i32 v10, v0, 0, 16 +; SI-NEXT: v_bfe_i32 v7, v2, 0, 16 ; SI-NEXT: v_bfe_i32 v12, v19, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v14, v17, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v16, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3 ; SI-NEXT: v_bfe_i32 v3, v15, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7 -; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: ds_write2_b64 v18, v[10:11], v[3:4] offset0:12 offset1:13 -; SI-NEXT: ds_write2_b64 v18, v[7:8], v[16:17] offset0:8 offset1:9 +; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13 +; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9 ; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5 ; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1 ; SI-NEXT: s_endpgm @@ -8106,16 +8114,16 @@ define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out ; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5 ; SI-NEXT: v_bfe_i32 v11, v6, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SI-NEXT: v_bfe_i32 v13, v4, 0, 16 +; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; SI-NEXT: v_bfe_i32 v15, v15, 0, 16 +; 
SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_bfe_i32 v16, v14, 0, 16 ; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9 ; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1 ; SI-NEXT: v_bfe_i32 v17, v18, 0, 16 -; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1 -; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8 ; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index a3ebaec..5f0ca7b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -74,7 +74,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x5000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 @@ -175,7 +176,9 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_add_i32 s0, s33, 0x5000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x2000 +; FLATSCR-NEXT: s_add_i32 s1, s33, s0 +; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 @@ -223,30 +226,35 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 ; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: s_movk_i32 s5, 0x12d4 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d4, v1 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 +; MUBUF-NEXT: s_movk_i32 s5, 0x12d0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 ; MUBUF-NEXT: s_movk_i32 s4, 0x4000 ; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12d0, v1 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 +; MUBUF-NEXT: s_movk_i32 s5, 0x12c4 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 ; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0 ; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c4, v1 -; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1 ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s4 -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12cc, v3 +; MUBUF-NEXT: s_movk_i32 s4, 0x12cc +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c8 ; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000 ; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000 ; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c8, v6 +; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6 ; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000 ; MUBUF-NEXT: buffer_load_dword v2, v2, 
s[0:3], 0 offen glc @@ -298,7 +306,8 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1 ; FLATSCR-NEXT: ; %bb.2: ; %split -; FLATSCR-NEXT: s_movk_i32 s0, 0x3000 +; FLATSCR-NEXT: s_movk_i32 s0, 0x1000 +; FLATSCR-NEXT: s_addk_i32 s0, 0x2000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:704 glc diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 88c619e..1ae3434 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -372,9 +372,8 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half % ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v0 ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt: diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 721f974..311527d 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -991,30 +991,81 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 { ; EG-LABEL: s_test_imin_sle_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @14, KC0[], KC1[] -; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 9, @15, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: ALU 1, @28, KC0[], KC1[] +; EG-NEXT: TEX 1 @12 +; EG-NEXT: ALU 9, @30, KC0[], KC1[] +; EG-NEXT: TEX 1 @16 +; EG-NEXT: ALU 10, @40, KC0[], KC1[] +; EG-NEXT: TEX 1 @20 +; EG-NEXT: ALU 10, @51, KC0[], KC1[] +; EG-NEXT: TEX 1 @24 +; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 46, #3 -; EG-NEXT: VTX_READ_16 T2.X, T0.X, 52, #3 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 44, #3 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 54, #3 -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 15: -; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3 +; EG-NEXT: Fetch clause starting at 16: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3 +; EG-NEXT: Fetch clause starting at 20: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3 +; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3 +; EG-NEXT: Fetch clause starting at 24: +; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3 +; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3 +; EG-NEXT: ALU clause starting at 28: +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: MOV * T5.X, 0.0, +; EG-NEXT: ALU clause starting at 30: +; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, 
literal.x, +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT T0.Y, PV.Z, PV.W, -; EG-NEXT: BFE_INT T0.Z, T3.X, 0.0, literal.x, -; EG-NEXT: BFE_INT * T0.W, T2.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: LSHL T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 40: +; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 51: +; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T2.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 62: +; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x, +; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212 +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W, +; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT T1.W, T0.Y, literal.y, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.z, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T6.X, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.X, +; EG-NEXT: MOV * T6.Y, T3.X, ; ; CI-LABEL: s_test_imin_sle_v4i16: ; CI: ; %bb.0: @@ -2154,40 +2205,49 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 { ; EG-LABEL: v_test_umin_ule_v3i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 3 @6 -; EG-NEXT: ALU 17, @18, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT MSKOR T4.XW, T0.X +; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @8 +; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 3 @12 +; EG-NEXT: ALU 8, @36, KC0[], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0 +; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X ; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_16 T2.X, T1.X, 0, #1 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 0, #1 -; EG-NEXT: VTX_READ_16 T1.X, T1.X, 4, #1 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 4, #1 -; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: Fetch clause starting at 8: +; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1 +; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1 +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1 +; 
EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1 +; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1 +; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1 +; EG-NEXT: ALU clause starting at 20: ; EG-NEXT: LSHL * T0.W, T0.X, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 18: +; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 24: ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x, ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) ; EG-NEXT: AND_INT * T2.W, PV.W, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: LSHL T2.W, PV.W, literal.x, -; EG-NEXT: MIN_UINT * T3.W, T0.X, T1.X, +; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T4.X, PS, PV.W, -; EG-NEXT: LSHL * T4.W, literal.x, PV.W, +; EG-NEXT: LSHL T7.X, PS, PV.W, +; EG-NEXT: LSHL * T7.W, literal.x, PV.W, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MOV T4.Y, 0.0, -; EG-NEXT: MOV * T4.Z, 0.0, +; EG-NEXT: MOV * T7.Y, 0.0, +; EG-NEXT: ALU clause starting at 36: +; EG-NEXT: MOV T7.Z, 0.0, +; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X, ; EG-NEXT: LSHR T0.X, T1.W, literal.x, -; EG-NEXT: MIN_UINT * T1.X, T3.X, T2.X, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T2.X, T0.W, literal.x, +; EG-NEXT: LSHL T1.W, PV.W, literal.y, +; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X, +; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44) +; EG-NEXT: OR_INT T6.X, PV.W, PS, +; EG-NEXT: LSHR * T8.X, T0.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; ; CI-LABEL: v_test_umin_ule_v3i16: @@ -3483,46 +3543,142 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32 define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 { ; EG-LABEL: s_test_umin_ult_v8i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @24, KC0[], KC1[] -; EG-NEXT: TEX 2 @8 -; EG-NEXT: ALU 2, @25, KC0[], KC1[] -; EG-NEXT: TEX 4 @14 -; EG-NEXT: ALU 14, @28, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: ALU 1, @52, KC0[], KC1[] +; EG-NEXT: TEX 1 @20 +; EG-NEXT: ALU 9, @54, KC0[], KC1[] +; EG-NEXT: TEX 1 @24 +; EG-NEXT: ALU 8, @64, KC0[], KC1[] +; EG-NEXT: TEX 1 @28 +; EG-NEXT: ALU 10, @73, KC0[], KC1[] +; EG-NEXT: TEX 1 @32 +; EG-NEXT: ALU 8, @84, KC0[], KC1[] +; EG-NEXT: TEX 1 @36 +; EG-NEXT: ALU 10, @93, KC0[], KC1[] +; EG-NEXT: TEX 1 @40 +; EG-NEXT: ALU 8, @104, KC0[], KC1[] +; EG-NEXT: TEX 1 @44 +; EG-NEXT: ALU 10, @113, KC0[], KC1[] +; EG-NEXT: TEX 1 @48 +; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 8: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 62, #3 -; EG-NEXT: VTX_READ_16 T2.X, T0.X, 60, #3 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 78, #3 -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_16 T1.X, T0.X, 68, #3 -; EG-NEXT: VTX_READ_16 T3.X, T0.X, 52, #3 -; EG-NEXT: VTX_READ_16 T4.X, T0.X, 70, #3 -; EG-NEXT: VTX_READ_16 T5.X, T0.X, 54, #3 -; EG-NEXT: VTX_READ_16 T0.X, T0.X, 76, #3 -; EG-NEXT: ALU clause starting at 24: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 25: -; EG-NEXT: AND_INT T0.W, T1.X, literal.x, -; EG-NEXT: AND_INT * T1.W, T3.X, literal.x, +; EG-NEXT: Fetch clause starting at 20: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3 +; EG-NEXT: Fetch clause 
starting at 24: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3 +; EG-NEXT: Fetch clause starting at 28: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3 +; EG-NEXT: Fetch clause starting at 32: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3 +; EG-NEXT: Fetch clause starting at 36: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3 +; EG-NEXT: Fetch clause starting at 40: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3 +; EG-NEXT: Fetch clause starting at 44: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3 +; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3 +; EG-NEXT: Fetch clause starting at 48: +; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3 +; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3 +; EG-NEXT: ALU clause starting at 52: +; EG-NEXT: MOV * T0.Y, T3.X, +; EG-NEXT: MOV * T7.X, 0.0, +; EG-NEXT: ALU clause starting at 54: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: ALU clause starting at 28: -; EG-NEXT: AND_INT T0.Z, T2.X, literal.x, -; EG-NEXT: AND_INT T2.W, T0.X, literal.x, BS:VEC_120/SCL_212 -; EG-NEXT: MIN_UINT * T0.W, T0.W, T1.W, +; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, +; EG-NEXT: LSHL T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 64: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.Z, PV.Z, PV.W, -; EG-NEXT: AND_INT T1.W, T5.X, literal.x, -; EG-NEXT: AND_INT * T2.W, T4.X, literal.x, +; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, +; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T3.X, PV.W, +; EG-NEXT: MOV * T0.Y, T2.X, +; EG-NEXT: ALU clause starting at 73: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.Y, PV.W, PS, -; EG-NEXT: AND_INT T1.W, T3.X, literal.x, -; EG-NEXT: AND_INT * T2.W, T1.X, literal.x, +; EG-NEXT: MIN_UINT T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: MIN_UINT T0.X, PV.W, PS, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T2.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 84: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, +; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T7.Z, PV.W, PS, +; EG-NEXT: MOV T2.X, PV.Z, +; EG-NEXT: MOV * T0.Y, T5.X, +; EG-NEXT: ALU clause starting at 93: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MIN_UINT T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 
0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 104: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: AND_INT T2.W, T0.Y, literal.x, +; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, PV.W, PS, +; EG-NEXT: MOV T5.X, PV.W, +; EG-NEXT: MOV * T0.Y, T4.X, +; EG-NEXT: ALU clause starting at 113: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: MIN_UINT T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T4.X, PV.W, +; EG-NEXT: MOV * T0.Y, PV.X, +; EG-NEXT: ALU clause starting at 124: +; EG-NEXT: AND_INT T0.W, T8.X, literal.x, +; EG-NEXT: AND_INT * T1.W, T7.X, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT T2.W, T0.Y, literal.y, +; EG-NEXT: MIN_UINT * T0.W, PV.W, PS, +; EG-NEXT: 2(2.802597e-45), -65536(nan) +; EG-NEXT: OR_INT * T7.X, PV.W, PS, +; EG-NEXT: MOV T4.X, PV.X, +; EG-NEXT: MOV * T7.W, T3.X, +; EG-NEXT: MOV * T7.Y, T5.X, ; ; CI-LABEL: s_test_umin_ult_v8i16: ; CI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll new file mode 100644 index 0000000..6d0aa1e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %val4, <16 x i64> %val16) { +; CHECK-LABEL: no_folding_imm_to_inst_with_fi: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: s_load_b256 s[36:43], s[4:5], 0x24 +; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4 +; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4 +; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base +; CHECK-NEXT: s_movk_i32 s33, 0x70 +; CHECK-NEXT: s_movk_i32 s34, 0x60 +; CHECK-NEXT: s_or_b32 s44, 0x80, s33 +; CHECK-NEXT: s_mov_b32 s45, s35 +; CHECK-NEXT: s_or_b32 s46, 0x80, s34 +; CHECK-NEXT: s_mov_b32 s47, s35 +; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45 +; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47 +; CHECK-NEXT: s_movk_i32 s34, 0x80 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41 +; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43 +; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37 +; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39 +; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21 +; CHECK-NEXT: s_movk_i32 s20, 0x50 +; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29 +; CHECK-NEXT: 
v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: s_or_b32 s20, 0x80, s20 +; CHECK-NEXT: s_mov_b32 s21, s35 +; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25 +; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27 +; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20 +; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17 +; CHECK-NEXT: s_or_b32 s16, 0x80, 64 +; CHECK-NEXT: s_mov_b32 s17, s35 +; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13 +; CHECK-NEXT: s_or_b32 s12, 0x80, 48 +; CHECK-NEXT: s_mov_b32 s13, s35 +; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; CHECK-NEXT: s_or_b32 s8, 0x80, 32 +; CHECK-NEXT: s_mov_b32 s9, s35 +; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5 +; CHECK-NEXT: s_or_b32 s4, 0x80, 16 +; CHECK-NEXT: s_mov_b32 s5, s35 +; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19 +; CHECK-NEXT: s_wait_alu 0xfffe +; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16 +; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15 +; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12 +; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8 +; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4 +; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7 +; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1 +; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3 +; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_storecnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_endpgm +bb: + %alloca = alloca <4 x i64>, align 32, addrspace(5) + %alloca1 = alloca <16 x i64>, align 128, addrspace(5) + store volatile <4 x i64> 
%val4, ptr addrspace(5) %alloca + %ascast = addrspacecast ptr addrspace(5) %alloca1 to ptr + store volatile <16 x i64> %val16, ptr %ascast + %load = load volatile <16 x i64>, ptr %ascast + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 5b0d2d2..42401af 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -3238,11 +3238,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX90A-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX90A-GISEL: ; %bb.0: ; %bb ; GFX90A-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX90A-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX90A-GISEL-NEXT: s_mov_b32 s3, s2 -; GFX90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX90A-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 ; GFX90A-GISEL-NEXT: v_mov_b32_e32 v2, s0 @@ -3253,11 +3250,8 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) { ; GFX942-GISEL-LABEL: fadd_fadd_fsub_0: ; GFX942-GISEL: ; %bb.0: ; %bb ; GFX942-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX942-GISEL-NEXT: s_mov_b32 s2, 0 -; GFX942-GISEL-NEXT: s_mov_b32 s3, s2 -; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], v[0:1] +; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], s[0:1], 0 ; GFX942-GISEL-NEXT: s_nop 0 ; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, v1 ; GFX942-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 8fe68ba..f0c8fed 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -533,8 +533,9 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 ; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 ; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], -1 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[8:9] +; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[4:5], s[14:15] ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 @@ -1930,19 +1931,16 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; 
GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll index 735720a..725d57d 100644 --- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -285,7 +285,7 @@ define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b32_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v1, 1.0 -; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 @@ -298,7 +298,7 @@ define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b16_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 @@ -311,7 +311,7 @@ define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { ; GCN-LABEL: flat_store_b64_idxprom: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 -; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset scope:SCOPE_SE ; GCN-NEXT: s_endpgm entry: %idxprom = sext i32 %idx to i64 @@ -337,12 +337,15 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] -; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base -; SDAG-NEXT: s_mov_b32 s0, exec_lo +; SDAG-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: v_xor_b32_e32 v0, s0, v3 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x3ffffff, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; SDAG-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 ; SDAG-NEXT: s_cbranch_execnz .LBB21_3 ; SDAG-NEXT: ; %bb.1: ; %Flow @@ -360,13 +363,16 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 ; SDAG-NEXT: s_cbranch_execz .LBB21_2 ; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; SDAG-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo ; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: v_subrev_nc_u32_e32 v0, s1, v2 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off ; SDAG-NEXT: s_wait_loadcnt 0x0 ; SDAG-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] -; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE ; SDAG-NEXT: s_wait_xcnt 0x0 ; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; SDAG-NEXT: s_branch .LBB21_5 @@ -374,19 +380,21 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: ; GISEL: ; %bb.0: ; %entry +; 
GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi ; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base -; GISEL-NEXT: s_mov_b32 s2, exec_lo ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3] ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 ; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_xor_b32_e32 v0, s2, v5 +; GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0 ; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5 +; GISEL-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2 ; GISEL-NEXT: s_cbranch_execnz .LBB21_3 ; GISEL-NEXT: ; %bb.1: ; %Flow @@ -398,19 +406,22 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg % ; GISEL-NEXT: s_branch .LBB21_5 ; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global ; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 -; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr4 ; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 ; GISEL-NEXT: s_cbranch_execz .LBB21_2 ; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo ; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v4 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo ; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off ; GISEL-NEXT: s_wait_loadcnt 0x0 ; GISEL-NEXT: v_add_nc_u64_e32 v[2:3], 1, v[0:1] -; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off scope:SCOPE_SE ; GISEL-NEXT: s_wait_xcnt 0x0 ; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GISEL-NEXT: s_branch .LBB21_5 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 676359fce..5c0f813 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -391,144 +391,156 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: sdiv_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s10, s2 +; GCN-NEXT: s_mov_b32 s11, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 ; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v3 -; GCN-NEXT: v_xor_b32_e32 v4, v0, v2 -; 
GCN-NEXT: v_xor_b32_e32 v7, v1, v3 -; GCN-NEXT: v_max_i32_e32 v2, v2, v6 -; GCN-NEXT: v_max_i32_e32 v3, v3, v9 -; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GCN-NEXT: v_max_i32_e32 v0, v0, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; GCN-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v3 -; GCN-NEXT: v_mul_lo_u32 v9, v9, v6 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v5 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GCN-NEXT: v_mul_hi_u32 v9, v6, v9 -; GCN-NEXT: v_max_i32_e32 v1, v1, v8 -; GCN-NEXT: v_mul_hi_u32 v8, v5, v10 -; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v6 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GCN-NEXT: v_mul_lo_u32 v8, v6, v2 -; GCN-NEXT: v_mul_lo_u32 v10, v5, v3 -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v6 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v9, vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v6 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] -; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GCN-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v7 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v7 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: s_abs_i32 s1, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s1 +; GCN-NEXT: s_sub_i32 s6, 0, s1 +; GCN-NEXT: v_readfirstlane_b32 s8, v3 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_lo_u32 v4, s6, v2 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_abs_i32 s7, s6 +; GCN-NEXT: s_xor_b32 s0, s6, s0 +; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: s_ashr_i32 s6, s0, 31 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_mul_i32 s0, s0, s1 +; GCN-NEXT: s_sub_i32 s0, s7, s0 +; GCN-NEXT: s_sub_i32 s7, s0, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: s_cselect_b32 s0, s7, s0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GCN-NEXT: s_cmp_ge_u32 s0, s1 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_abs_i32 s7, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s7 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_sub_i32 s4, 0, s7 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, 
vcc, s6, v0 +; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v3 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: s_xor_b32 s5, s4, s8 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: v_mul_hi_u32 v1, v3, v4 +; GCN-NEXT: s_ashr_i32 s5, s5, 31 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: s_mul_i32 s6, s6, s7 +; GCN-NEXT: s_sub_i32 s4, s4, s6 +; GCN-NEXT: s_sub_i32 s6, s4, s7 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GCN-NEXT: s_cmp_ge_u32 s4, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: s_cselect_b32 s4, s6, s4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v1 +; GCN-NEXT: s_cmp_ge_u32 s4, s7 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_xor_b32_e32 v1, s5, v1 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v1 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v2i32: ; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 +; TONGA-NEXT: s_mov_b32 s3, 0xf000 +; TONGA-NEXT: s_mov_b32 s2, -1 +; TONGA-NEXT: s_mov_b32 s10, s2 +; TONGA-NEXT: s_mov_b32 s11, s3 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 +; TONGA-NEXT: s_mov_b32 s8, s6 +; TONGA-NEXT: s_mov_b32 s9, s7 ; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3 -; TONGA-NEXT: v_xor_b32_e32 v4, v0, v2 -; TONGA-NEXT: v_xor_b32_e32 v7, v1, v3 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v6 -; TONGA-NEXT: v_max_i32_e32 v3, v3, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 -; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 -; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 -; TONGA-NEXT: v_cvt_u32_f32_e32 v6, v6 -; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v5 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v3 -; TONGA-NEXT: v_mul_lo_u32 v9, v9, v6 -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v5 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, 0, v1 -; TONGA-NEXT: v_mul_hi_u32 v9, v6, v9 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v8 -; TONGA-NEXT: v_mul_hi_u32 v8, v5, v10 -; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v9 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v5, v8 -; TONGA-NEXT: v_mul_hi_u32 v6, v0, v6 -; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; TONGA-NEXT: v_mul_lo_u32 v8, v6, v2 -; TONGA-NEXT: v_mul_lo_u32 v10, v5, v3 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v10 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v2 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v1, v3 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v1, v3 -; 
TONGA-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[2:3] -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v6 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[2:3] -; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v4 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v7 -; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; TONGA-NEXT: v_readfirstlane_b32 s0, v2 +; TONGA-NEXT: s_abs_i32 s1, s0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s1 +; TONGA-NEXT: s_sub_i32 s6, 0, s1 +; TONGA-NEXT: v_readfirstlane_b32 s8, v3 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 +; TONGA-NEXT: v_mul_lo_u32 v4, s6, v2 +; TONGA-NEXT: v_readfirstlane_b32 s6, v0 +; TONGA-NEXT: s_abs_i32 s7, s6 +; TONGA-NEXT: s_xor_b32 s0, s6, s0 +; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: s_ashr_i32 s6, s0, 31 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v4 +; TONGA-NEXT: v_mul_hi_u32 v0, s7, v0 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 +; TONGA-NEXT: s_mul_i32 s0, s0, s1 +; TONGA-NEXT: s_sub_i32 s0, s7, s0 +; TONGA-NEXT: s_sub_i32 s7, s0, s1 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s0, s1 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; TONGA-NEXT: s_cselect_b32 s0, s7, s0 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; TONGA-NEXT: s_cmp_ge_u32 s0, s1 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: s_abs_i32 s7, s8 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s7 +; TONGA-NEXT: s_mov_b32 s0, s4 +; TONGA-NEXT: s_sub_i32 s4, 0, s7 +; TONGA-NEXT: s_mov_b32 s1, s5 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s6, v0 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s6, v0 +; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_lo_u32 v4, s4, v3 +; TONGA-NEXT: v_readfirstlane_b32 s4, v1 +; TONGA-NEXT: s_xor_b32 s5, s4, s8 +; TONGA-NEXT: s_abs_i32 s4, s4 +; TONGA-NEXT: v_mul_hi_u32 v1, v3, v4 +; TONGA-NEXT: s_ashr_i32 s5, s5, 31 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 +; TONGA-NEXT: v_readfirstlane_b32 s6, v1 +; TONGA-NEXT: s_mul_i32 s6, s6, s7 +; TONGA-NEXT: s_sub_i32 s4, s4, s6 +; TONGA-NEXT: s_sub_i32 s6, s4, s7 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 +; TONGA-NEXT: s_cmp_ge_u32 s4, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; TONGA-NEXT: s_cselect_b32 s4, s6, s4 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, 1, v1 +; TONGA-NEXT: s_cmp_ge_u32 s4, s7 +; TONGA-NEXT: s_cselect_b64 vcc, -1, 0 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; TONGA-NEXT: v_xor_b32_e32 v1, s5, v1 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s5, v1 +; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32: @@ -546,44 +558,44 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: s_abs_i32 s1, s0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_xor_b32 s0, s5, s0 +; GFX9-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 -; GFX9-NEXT: s_abs_i32 s5, s5 -; GFX9-NEXT: v_readfirstlane_b32 s4, v3 +; GFX9-NEXT: s_abs_i32 s4, s4 +; GFX9-NEXT: v_readfirstlane_b32 s5, v3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s0, s0, s7 ; GFX9-NEXT: s_mul_hi_u32 s0, s7, s0 ; GFX9-NEXT: s_add_i32 s7, s7, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s5, s7 +; GFX9-NEXT: s_mul_hi_u32 s0, s4, s7 ; GFX9-NEXT: s_mul_i32 s7, s0, s1 -; GFX9-NEXT: s_sub_i32 s5, s5, s7 +; GFX9-NEXT: s_sub_i32 s4, s4, s7 ; GFX9-NEXT: s_add_i32 s10, s0, 1 -; GFX9-NEXT: s_sub_i32 s7, s5, s1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 +; GFX9-NEXT: s_sub_i32 s7, s4, s1 +; GFX9-NEXT: s_cmp_ge_u32 s4, s1 ; GFX9-NEXT: s_cselect_b32 s0, s10, s0 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 ; GFX9-NEXT: s_add_i32 s7, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s5, s1 -; GFX9-NEXT: s_cselect_b32 s5, s7, s0 -; GFX9-NEXT: s_abs_i32 s7, s4 +; GFX9-NEXT: s_cmp_ge_u32 s4, s1 +; GFX9-NEXT: s_cselect_b32 s4, s7, s0 +; GFX9-NEXT: s_abs_i32 s7, s5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_xor_b32 s5, s5, s6 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s4, s4, s6 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s4, s8, s4 +; GFX9-NEXT: s_xor_b32 s5, s8, s5 ; GFX9-NEXT: s_abs_i32 s8, s8 -; GFX9-NEXT: s_ashr_i32 s4, s4, 31 +; GFX9-NEXT: s_ashr_i32 s5, s5, 31 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mul_i32 s9, s9, s6 ; GFX9-NEXT: s_mul_hi_u32 s9, s6, s9 @@ -599,10 +611,10 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_add_i32 s9, s6, 1 ; GFX9-NEXT: s_cmp_ge_u32 s8, s7 ; GFX9-NEXT: s_cselect_b32 s6, s9, s6 -; GFX9-NEXT: s_xor_b32 s6, s6, s4 -; GFX9-NEXT: s_sub_i32 s4, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: s_xor_b32 s6, s6, s5 +; GFX9-NEXT: s_sub_i32 s5, s6, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -792,255 +804,255 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-LABEL: sdiv_v4i32: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s10, s6 -; GCN-NEXT: s_mov_b32 s11, s7 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s6, s10 +; GCN-NEXT: s_mov_b32 s7, s11 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s2 -; GCN-NEXT: s_mov_b32 s9, s3 -; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: 
v_sub_i32_e32 v9, vcc, 0, v0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v4 -; GCN-NEXT: v_xor_b32_e32 v8, v0, v4 -; GCN-NEXT: v_max_i32_e32 v4, v4, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 -; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_max_i32_e32 v5, v5, v13 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 -; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 -; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_max_i32_e32 v0, v0, v9 -; GCN-NEXT: v_mul_hi_u32 v16, v10, v16 -; GCN-NEXT: v_max_i32_e32 v1, v1, v12 -; GCN-NEXT: v_sub_i32_e32 v15, vcc, 0, v6 -; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v16 -; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 -; GCN-NEXT: v_mul_hi_u32 v10, v0, v10 -; GCN-NEXT: v_xor_b32_e32 v14, v2, v6 -; GCN-NEXT: v_max_i32_e32 v6, v6, v15 -; GCN-NEXT: v_mul_hi_u32 v12, v13, v16 -; GCN-NEXT: v_cvt_f32_u32_e32 v15, v6 -; GCN-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GCN-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 -; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v13, vcc, v0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v12, v5 -; GCN-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v6 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, 1, v12 -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; GCN-NEXT: v_sub_i32_e32 v12, vcc, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v4, v4, v9 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 0, v7 -; GCN-NEXT: v_max_i32_e32 v5, v7, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 -; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_abs_i32 s13, s0 +; GCN-NEXT: s_abs_i32 s14, s1 +; GCN-NEXT: s_abs_i32 s15, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s13 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14 +; GCN-NEXT: v_cvt_f32_u32_e32 v2, s15 +; GCN-NEXT: v_readfirstlane_b32 s6, v3 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; GCN-NEXT: v_max_i32_e32 v2, v2, v9 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 +; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: s_abs_i32 s17, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v3, s17 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] -; GCN-NEXT: v_xor_b32_e32 v0, v0, v8 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 -; GCN-NEXT: v_mul_lo_u32 
v8, v4, v6 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 1, v1 -; GCN-NEXT: v_sub_i32_e32 v10, vcc, 0, v5 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; GCN-NEXT: v_mul_lo_u32 v10, v10, v9 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v2, v6 -; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v11 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; GCN-NEXT: v_mul_hi_u32 v4, v9, v10 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 -; GCN-NEXT: v_max_i32_e32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v6, v4 -; GCN-NEXT: v_xor_b32_e32 v2, v2, v14 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 -; GCN-NEXT: v_mul_lo_u32 v8, v4, v5 -; GCN-NEXT: v_xor_b32_e32 v3, v3, v7 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_sub_i32_e32 v8, vcc, v6, v5 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v4 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GCN-NEXT: v_xor_b32_e32 v4, v4, v3 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s3, v4 +; GCN-NEXT: v_readfirstlane_b32 s4, v5 +; GCN-NEXT: v_readfirstlane_b32 s5, v6 +; GCN-NEXT: s_xor_b32 s12, s3, s0 +; GCN-NEXT: s_xor_b32 s0, s4, s1 +; GCN-NEXT: s_xor_b32 s1, s5, s2 +; GCN-NEXT: s_sub_i32 s2, 0, s13 +; GCN-NEXT: s_ashr_i32 s18, s0, 31 +; GCN-NEXT: s_sub_i32 s0, 0, s14 +; GCN-NEXT: s_ashr_i32 s19, s1, 31 +; GCN-NEXT: s_sub_i32 s1, 0, s15 +; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s1, v2 +; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: s_sub_i32 s20, 0, s17 +; GCN-NEXT: v_readfirstlane_b32 s7, v7 +; GCN-NEXT: s_abs_i32 s3, s3 +; GCN-NEXT: s_abs_i32 s4, s4 +; GCN-NEXT: s_abs_i32 s5, s5 +; GCN-NEXT: v_mul_lo_u32 v7, s20, v3 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s13 +; GCN-NEXT: v_mul_lo_u32 v6, v1, s14 +; GCN-NEXT: v_mul_lo_u32 v8, v2, s15 +; GCN-NEXT: s_abs_i32 s16, s7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_mul_hi_u32 v3, s16, v3 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s3, v4 +; GCN-NEXT: v_sub_i32_e32 v6, vcc, s4, v6 +; GCN-NEXT: v_sub_i32_e32 v8, vcc, s5, v8 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1 +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; 
GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 +; GCN-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 +; GCN-NEXT: v_subrev_i32_e32 v10, vcc, s13, v4 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v6 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s15, v8 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] +; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] +; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v2 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v3, s17 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; GCN-NEXT: s_ashr_i32 s12, s12, 31 +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s18, v1 +; GCN-NEXT: v_xor_b32_e32 v2, s19, v2 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, s16, v4 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s18, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s19, v2 +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s17, v4 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 +; GCN-NEXT: s_xor_b32 s0, s7, s6 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; GCN-NEXT: s_ashr_i32 s0, s0, 31 +; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GCN-NEXT: v_xor_b32_e32 v3, s0, v3 +; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v3 +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; TONGA-LABEL: sdiv_v4i32: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; TONGA-NEXT: s_mov_b32 s7, 0xf000 -; TONGA-NEXT: s_mov_b32 s6, -1 -; TONGA-NEXT: s_mov_b32 s10, s6 -; TONGA-NEXT: s_mov_b32 s11, s7 +; TONGA-NEXT: s_mov_b32 s11, 0xf000 +; TONGA-NEXT: s_mov_b32 s10, -1 +; TONGA-NEXT: s_mov_b32 s6, s10 +; TONGA-NEXT: s_mov_b32 s7, s11 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: s_mov_b32 s8, s2 -; TONGA-NEXT: s_mov_b32 s9, s3 -; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; TONGA-NEXT: s_mov_b32 s4, s0 -; TONGA-NEXT: s_mov_b32 s5, s1 +; TONGA-NEXT: s_mov_b32 s4, s2 +; TONGA-NEXT: s_mov_b32 s5, s3 +; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 +; TONGA-NEXT: s_mov_b32 s8, s0 +; TONGA-NEXT: s_mov_b32 s9, s1 ; TONGA-NEXT: s_waitcnt vmcnt(1) -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v0 -; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v4 -; TONGA-NEXT: v_xor_b32_e32 v8, v0, v4 -; TONGA-NEXT: v_max_i32_e32 v4, v4, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 -; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 -; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13 -; TONGA-NEXT: v_sub_u32_e32 
v12, vcc, 0, v1 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 -; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 -; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v13 -; TONGA-NEXT: v_max_i32_e32 v0, v0, v9 -; TONGA-NEXT: v_mul_hi_u32 v16, v10, v16 -; TONGA-NEXT: v_max_i32_e32 v1, v1, v12 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v6 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v16 -; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v5 -; TONGA-NEXT: v_mul_lo_u32 v16, v16, v13 -; TONGA-NEXT: v_mul_hi_u32 v10, v0, v10 -; TONGA-NEXT: v_xor_b32_e32 v14, v2, v6 -; TONGA-NEXT: v_max_i32_e32 v6, v6, v15 -; TONGA-NEXT: v_mul_hi_u32 v12, v13, v16 -; TONGA-NEXT: v_cvt_f32_u32_e32 v15, v6 -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; TONGA-NEXT: v_ashrrev_i32_e32 v11, 31, v11 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 -; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 -; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 -; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v10, v13, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v0, v4 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v13, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v0, v4 -; TONGA-NEXT: v_mul_lo_u32 v0, v12, v5 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v9 -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v6 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, 1, v12 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[2:3] -; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v0, v5 -; TONGA-NEXT: v_mul_lo_u32 v4, v4, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v0, v5 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, 0, v7 -; TONGA-NEXT: v_max_i32_e32 v5, v7, v0 -; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 +; TONGA-NEXT: v_readfirstlane_b32 s1, v1 +; TONGA-NEXT: v_readfirstlane_b32 s2, v2 +; TONGA-NEXT: s_abs_i32 s13, s0 +; TONGA-NEXT: s_abs_i32 s14, s1 +; TONGA-NEXT: s_abs_i32 s15, s2 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s13 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s14 +; TONGA-NEXT: v_cvt_f32_u32_e32 v2, s15 +; TONGA-NEXT: v_readfirstlane_b32 s6, v3 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 -; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 -; TONGA-NEXT: v_mul_hi_u32 v4, v2, v4 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: s_abs_i32 s17, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v3, s17 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 -; TONGA-NEXT: v_cndmask_b32_e64 v0, v10, v13, s[0:1] -; TONGA-NEXT: v_xor_b32_e32 v0, v0, v8 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v6 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, 1, v1 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, 0, v5 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v12, s[2:3] -; TONGA-NEXT: v_mul_lo_u32 v10, v10, v9 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e64 s[0:1], v2, v6 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[0:1] -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v2, v6 -; 
TONGA-NEXT: v_sub_u32_e32 v1, vcc, v1, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 -; TONGA-NEXT: v_cndmask_b32_e32 v2, v4, v8, vcc -; TONGA-NEXT: v_mul_hi_u32 v4, v9, v10 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v3 -; TONGA-NEXT: v_max_i32_e32 v6, v3, v6 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 -; TONGA-NEXT: v_mul_hi_u32 v4, v6, v4 -; TONGA-NEXT: v_xor_b32_e32 v2, v2, v14 -; TONGA-NEXT: v_sub_u32_e32 v2, vcc, v2, v14 -; TONGA-NEXT: v_mul_lo_u32 v8, v4, v5 -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_sub_u32_e32 v6, vcc, v6, v8 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v6, v5 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v4 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v6, v5 -; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v3 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; TONGA-NEXT: v_xor_b32_e32 v4, v4, v3 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v4, v3 -; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 +; TONGA-NEXT: s_waitcnt vmcnt(0) +; TONGA-NEXT: v_readfirstlane_b32 s3, v4 +; TONGA-NEXT: v_readfirstlane_b32 s4, v5 +; TONGA-NEXT: v_readfirstlane_b32 s5, v6 +; TONGA-NEXT: s_xor_b32 s12, s3, s0 +; TONGA-NEXT: s_xor_b32 s0, s4, s1 +; TONGA-NEXT: s_xor_b32 s1, s5, s2 +; TONGA-NEXT: s_sub_i32 s2, 0, s13 +; TONGA-NEXT: s_ashr_i32 s18, s0, 31 +; TONGA-NEXT: s_sub_i32 s0, 0, s14 +; TONGA-NEXT: s_ashr_i32 s19, s1, 31 +; TONGA-NEXT: s_sub_i32 s1, 0, s15 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_lo_u32 v4, s2, v0 +; TONGA-NEXT: v_mul_lo_u32 v5, s0, v1 +; TONGA-NEXT: v_mul_lo_u32 v6, s1, v2 +; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 +; TONGA-NEXT: v_cvt_u32_f32_e32 v3, v3 +; TONGA-NEXT: v_mul_hi_u32 v4, v0, v4 +; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 +; TONGA-NEXT: v_mul_hi_u32 v6, v2, v6 +; TONGA-NEXT: s_sub_i32 s20, 0, s17 +; TONGA-NEXT: v_readfirstlane_b32 s7, v7 +; TONGA-NEXT: s_abs_i32 s3, s3 +; TONGA-NEXT: s_abs_i32 s4, s4 +; TONGA-NEXT: s_abs_i32 s5, s5 +; TONGA-NEXT: v_mul_lo_u32 v7, s20, v3 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; TONGA-NEXT: v_mul_hi_u32 v0, s3, v0 +; TONGA-NEXT: v_mul_hi_u32 v1, s4, v1 +; TONGA-NEXT: v_mul_hi_u32 v2, s5, v2 +; TONGA-NEXT: v_mul_hi_u32 v7, v3, v7 +; TONGA-NEXT: v_mul_lo_u32 v4, v0, s13 +; TONGA-NEXT: v_mul_lo_u32 v6, v1, s14 +; TONGA-NEXT: v_mul_lo_u32 v8, v2, s15 +; TONGA-NEXT: s_abs_i32 s16, s7 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7 +; TONGA-NEXT: v_mul_hi_u32 v3, s16, v3 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s3, v4 +; TONGA-NEXT: v_sub_u32_e32 v6, vcc, s4, v6 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s5, v8 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v0 +; TONGA-NEXT: v_add_u32_e32 v7, vcc, 1, v1 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v2 +; TONGA-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 +; TONGA-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v6 +; TONGA-NEXT: v_cmp_le_u32_e64 s[4:5], s15, v8 +; TONGA-NEXT: v_subrev_u32_e32 v10, vcc, s13, v4 +; TONGA-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1] +; TONGA-NEXT: v_subrev_u32_e32 v5, vcc, s14, v6 +; 
TONGA-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[2:3] +; TONGA-NEXT: v_subrev_u32_e32 v7, vcc, s15, v8 +; TONGA-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[4:5] +; TONGA-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] +; TONGA-NEXT: v_add_u32_e32 v9, vcc, 1, v0 +; TONGA-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] +; TONGA-NEXT: v_add_u32_e32 v6, vcc, 1, v1 +; TONGA-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[4:5] +; TONGA-NEXT: v_add_u32_e32 v8, vcc, 1, v2 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 +; TONGA-NEXT: v_mul_lo_u32 v4, v3, s17 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 +; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 +; TONGA-NEXT: s_ashr_i32 s12, s12, 31 +; TONGA-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; TONGA-NEXT: v_xor_b32_e32 v0, s12, v0 +; TONGA-NEXT: v_xor_b32_e32 v1, s18, v1 +; TONGA-NEXT: v_xor_b32_e32 v2, s19, v2 +; TONGA-NEXT: v_sub_u32_e32 v4, vcc, s16, v4 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, s12, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s18, v1 +; TONGA-NEXT: v_subrev_u32_e32 v2, vcc, s19, v2 +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; TONGA-NEXT: v_subrev_u32_e32 v6, vcc, s17, v4 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; TONGA-NEXT: v_add_u32_e32 v5, vcc, 1, v3 +; TONGA-NEXT: s_xor_b32 s0, s7, s6 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s17, v4 +; TONGA-NEXT: s_ashr_i32 s0, s0, 31 +; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; TONGA-NEXT: v_xor_b32_e32 v3, s0, v3 +; TONGA-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 +; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; TONGA-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v4i32: @@ -2002,7 +2014,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm @@ -2049,7 +2061,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v1, v0 +; TONGA-NEXT: v_subrev_u32_e32 v0, vcc, v0, v1 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 ; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; TONGA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll index 49dec15..584d26e 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffle.ll @@ -42,34 +42,35 @@ define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) ; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; 
GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v2, 24, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 24, v1 ; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v5, 24, v0 ; GFX11-FAKE16-NEXT: v_ashrrev_i16 v6, 8, v1 ; GFX11-FAKE16-NEXT: v_bfe_i32 v7, v0, 0, 8 ; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0 ; GFX11-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-FAKE16-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8 ; GFX11-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v7, v7 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v6, v6 ; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v5 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v2 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v2, v4 -; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v4 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v9, v2 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v0, v7 ; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v6, v1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v2 -; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v4 -; GFX11-FAKE16-NEXT: global_store_b128 v9, v[0:3], s[0:1] +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v4 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v9 +; GFX11-FAKE16-NEXT: global_store_b128 v10, v[0:3], s[0:1] ; GFX11-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 7aa7342..28330bf 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -681,30 +681,63 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in ; ; EG-LABEL: shl_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 3, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T8.X, 1 +; EG-NEXT: ALU 42, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T8.XYZW, T0.X, 0, #1 +; EG-NEXT: VTX_READ_128 T10.XYZW, T0.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, T6.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: LSHR T1.W, T8.Z, literal.x, -; EG-NEXT: LSHR * T2.W, T8.X, literal.x, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: AND_INT * T1.W, T10.Z, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, T10.X, PV.W, +; EG-NEXT: AND_INT T1.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T2.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T1.W, PS, PV.W, +; EG-NEXT: MOV * T6.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR T1.W, T10.Z, literal.x, +; EG-NEXT: LSHR * T2.W, T10.X, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHL T0.Y, PS, PV.W, -; EG-NEXT: AND_INT T1.W, T8.Z, literal.x, -; EG-NEXT: AND_INT * T2.W, T8.X, literal.x, +; EG-NEXT: LSHL T1.W, PS, PV.W, +; 
EG-NEXT: AND_INT * T2.W, PV.X, literal.x, ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) -; EG-NEXT: LSHL T0.X, PS, PV.W, +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV * T0.X, T7.X, +; EG-NEXT: AND_INT * T1.W, T10.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL T1.W, T10.Y, PV.W, +; EG-NEXT: AND_INT * T2.W, T0.X, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT * T1.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.W, T2.W, PV.W, +; EG-NEXT: MOV * T7.X, PV.W, +; EG-NEXT: MOV T0.X, PV.X, +; EG-NEXT: LSHR T1.W, T10.W, literal.x, +; EG-NEXT: LSHR * T2.W, T10.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, PS, PV.W, +; EG-NEXT: AND_INT T0.Z, T0.X, literal.x, +; EG-NEXT: LSHL T1.W, PV.W, literal.y, ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T8.X, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44) +; EG-NEXT: LSHR T0.X, PS, literal.x, +; EG-NEXT: OR_INT * T10.Y, PV.Z, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T7.X, PV.Y, +; EG-NEXT: MOV * T10.X, T6.X, %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir index 7852f5d..23b24a2 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fold-reg-sequence.mir @@ -1,11 +1,23 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s +# Check that we don't hang on this. --- name: fold_reg_sequence body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-LABEL: name: fold_reg_sequence + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 429 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 $vgpr2, [[REG_SEQUENCE]].sub0, implicit $exec + ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1) + ; CHECK-NEXT: [[V_MUL_HI_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MUL_HI_U32_e64 [[GLOBAL_LOAD_DWORD]], [[REG_SEQUENCE]].sub0, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 %0:sreg_32 = S_MOV_B32 0 %1:sreg_32 = S_MOV_B32 429 %2:sreg_64 = REG_SEQUENCE killed %1, %subreg.sub0, %0, %subreg.sub1 @@ -13,6 +25,20 @@ body: | %4:vgpr_32 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec :: (load (s32), addrspace 1) %5:vgpr_32 = V_MUL_HI_U32_e64 %4, %2.sub0, implicit $exec S_ENDPGM 0 - ... +# Fold through a COPY of REG_SEQUENCE. 
+--- +name: fold_through_copy +body: | + bb.0: + ; CHECK-LABEL: name: fold_through_copy + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK-NEXT: [[V_PK_ADD_F32_:%[0-9]+]]:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, [[DEF]], 8, 0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %0:sreg_32 = S_MOV_B32 0 + %1:sreg_64 = REG_SEQUENCE %0:sreg_32, %subreg.sub0, %0:sreg_32, %subreg.sub1 + %2:sreg_64_xexec = IMPLICIT_DEF + %3:vreg_64_align2 = COPY %1:sreg_64 + %4:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %2:sreg_64_xexec, 8, %3:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index 5d169c1..80c0d0f 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -320,28 +320,67 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; ; EG-LABEL: ashr_v4i16: ; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T8.X, 1 +; EG-NEXT: ALU 48, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1 ; EG-NEXT: CF_END ; EG-NEXT: PAD ; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1 +; EG-NEXT: VTX_READ_128 T9.XYZW, T9.X, 0, #1 ; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T7.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: LSHR T0.Z, T7.X, literal.x, -; EG-NEXT: BFE_INT T0.W, T7.X, 0.0, literal.x, -; EG-NEXT: AND_INT * T1.W, T7.Z, literal.y, +; EG-NEXT: MOV * T0.Y, T6.X, +; EG-NEXT: MOV * T9.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: BFE_INT T0.W, T9.X, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.Z, literal.y, ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) -; EG-NEXT: ASHR T7.X, PV.W, PS, -; EG-NEXT: BFE_INT T0.W, PV.Z, 0.0, literal.x, -; EG-NEXT: LSHR * T1.W, T7.Z, literal.x, +; EG-NEXT: ASHR * T0.W, PV.W, PS, +; EG-NEXT: AND_INT T0.W, PV.W, literal.x, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), -65536(nan) +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: MOV * T6.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T9.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T9.Z, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: ASHR T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV T6.X, PV.W, +; EG-NEXT: MOV T0.Y, T7.X, +; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x, +; EG-NEXT: AND_INT * T1.W, T9.W, literal.y, +; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41) +; EG-NEXT: ASHR T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x, +; EG-NEXT: -65536(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: OR_INT * T0.W, T1.W, PV.W, +; EG-NEXT: MOV * T7.X, PV.W, +; EG-NEXT: MOV T0.Y, PV.X, +; EG-NEXT: LSHR * T0.W, T9.Y, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x, +; EG-NEXT: LSHR * T1.W, T9.W, literal.x, +; EG-NEXT: 16(2.242078e-44), 
0(0.000000e+00) +; EG-NEXT: ASHR T0.W, PV.W, PS, +; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x, +; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x, -; EG-NEXT: ASHR * T7.Y, PV.W, PS, +; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T10.Y, T1.W, PV.W, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: MOV T7.X, PV.Y, +; EG-NEXT: MOV * T10.X, T6.X, %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1 %a = load <4 x i16>, ptr addrspace(1) %in %b = load <4 x i16>, ptr addrspace(1) %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 141b86a..4a6202ea 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -493,9 +493,9 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l ; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll index 84fe4ec..98d0a62 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll @@ -299,13 +299,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a, ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0x7fff ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v6, v0 -; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v0 -; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v5, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v6, v0 +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v4.l -; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.h, v5.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.h, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v5.l ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index ed2f06b..b5d9d00 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3963,8 +3963,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: 
v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4067,8 +4067,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4175,8 +4175,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4283,8 +4283,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4387,8 +4387,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4502,8 +4502,8 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index 8812cae..2a989ec 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3963,8 +3963,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; 
GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4067,8 +4067,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4175,8 +4175,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4283,8 +4283,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4387,8 +4387,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4502,8 +4502,8 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 82eb122..69fd58a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ 
b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3839,8 +3839,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3943,8 +3943,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -4051,8 +4051,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -4159,8 +4159,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4263,8 +4263,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4378,8 +4378,8 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git 
a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 115b05a..aab0e76 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3585,8 +3585,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3689,8 +3689,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc @@ -3797,8 +3797,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc @@ -3905,8 +3905,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4009,8 +4009,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4124,8 +4124,8 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 ; GFX12-SDAG-NEXT: 
v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir index 2032b98..fa3b9244 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir @@ -834,6 +834,222 @@ body: | ... --- +name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... 
+
+---
+name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr81, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F8_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F8_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 1, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F6f4_D0_overlaps_A1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F3216_16x16x128_F8F6F4_F6f4_D0_overlaps_A1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_B1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_B1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE16_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41, killed $vgpr42_vgpr43, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_Index1
+body: |
+  bb.0:
+    ; GFX1250-LABEL: name: test_wmma_scale16_F32_16x16x128_F8F6F4_F6f4_D0_overlaps_Index1
+    ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+    ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+    $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40, killed $vgpr41, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec
+...
+
+---
 name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1
 body: |
   bb.0: