Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
26 files changed, 2560 insertions, 308 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 8a80afd..fa0e4b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -257,20 +257,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -357,20 +353,16 @@ define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
 ; UNALIGNED_GFX12: ; %bb.0: ; %bb
 ; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, s0 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
@@ -937,19 +929,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
 ; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -1048,19 +1038,17 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
 ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
@@ -1579,19 +1567,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; GFX12-NEXT: s_wait_kmcnt 0x0
 ; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_lshl_b32 s0, s0, 7
 ; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_loadcnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -1692,19 +1678,17 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) {
 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0
 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
-; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0
+; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0
 ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7
 ; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1
-; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, s0 offset:124 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
@@ -4060,9 +4044,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
 ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -4113,9 +4095,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr a
 ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
 ; UNALIGNED_GFX12: ; %bb.0: ; %bb
 ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:65512 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
@@ -4172,9 +4152,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
 ; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
 ; GFX12: ; %bb.0: ; %bb
 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
 ; GFX12-NEXT: s_wait_storecnt 0x0
 ; GFX12-NEXT: s_endpgm
 ;
@@ -4223,9 +4201,7 @@ define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(pt
 ; UNALIGNED_GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
 ; UNALIGNED_GFX12: ; %bb.0: ; %bb
 ; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_add_nc_u32 v0, s1, v0
-; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, off offset:-16 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0
 ; UNALIGNED_GFX12-NEXT: s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
index be3fe91..4f5f52b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -31,3 +31,33 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+name: memcpy_test_volatile
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; CHECK-LABEL: name: memcpy_test_volatile
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+    ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+    %6:_(s32) = G_CONSTANT i32 1
+    %7:_(s64) = G_ZEXT %6:_(s32)
+    G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
index a82ca30..0392aef 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -31,3 +31,33 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+name: memcpyinline_test_volatile
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; CHECK-LABEL: name: memcpyinline_test_volatile
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+    ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+    %6:_(s32) = G_CONSTANT i32 1
+    %7:_(s64) = G_ZEXT %6:_(s32)
+    G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8))
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
index e7cfaab..1f8d1aa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
@@ -31,3 +31,33 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+name: memmove_test_volatile
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    ; CHECK-LABEL: name: memmove_test_volatile
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+    ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+    ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8))
+    ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8))
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32)
+    %6:_(s32) = G_CONSTANT i32 1
+    %7:_(s64) = G_ZEXT %6:_(s32)
+    G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8))
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
index 021cebb..dda94e15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
@@ -30,3 +30,32 @@ body: |
     S_ENDPGM 0
 
 ...
+---
+name: memset_test_volatile
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: memset_test_volatile
+    ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+    ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32)
+    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8)
+    ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8))
+    ; CHECK-NEXT: S_ENDPGM 0
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s16) = G_TRUNC %3:_(s32)
+    %5:_(s8) = G_TRUNC %4:_(s16)
+    %6:_(s32) = G_CONSTANT i32 1
+    %7:_(s64) = G_ZEXT %6:_(s32)
+    G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8))
+    S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
new file mode 100644
index 0000000..b992506
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -0,0 +1,295 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_u32_svv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
+; GCN-LABEL: add_max_u32_ssv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_co_i32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_max_u32_e32 v0, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
+; GCN-LABEL: add_max_u32_sss:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_co_i32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_max_u32 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
+; GCN-LABEL: add_max_u32_vsi:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, 4, v0
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 4)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
+; GCN-LABEL: add_max_u32_svl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_u32_e32 v0, 0x64, v0
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 100)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b) {
+; GCN-LABEL: add_max_u32_slv:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_addk_co_i32 s0, 0x64
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_max_u32_e32 v0, s0, v0
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, 100
+  %max = call i32 @llvm.umax.i32(i32 %add, i32 %b)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_max_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_max_i32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.smax.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_min_u32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_min_u32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.umin.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
+; GCN-LABEL: add_min_i32_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_min_i32_e32 v0, v0, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add i32 %a, %b
+  %max = call i32 @llvm.smin.i32(i32 %add, i32 %c)
+  %ret = bitcast i32 %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2u16_svv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, v1
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
+; SDAG-LABEL: add_max_v2u16_ssv:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_max_u16 v0, s0, s1, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_ssv:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s2, s0, 16
+; GISEL-NEXT: s_lshr_b32 s3, s1, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, s1
+; GISEL-NEXT: s_add_co_i32 s2, s2, s3
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s2
+; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
+; GISEL-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
+; SDAG-LABEL: add_max_v2u16_sss:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_u16 v0, s0, s1
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_pk_max_u16 v0, v0, s2
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_sss:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s3, s0, 16
+; GISEL-NEXT: s_lshr_b32 s4, s1, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, s1
+; GISEL-NEXT: s_add_co_i32 s3, s3, s4
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s3
+; GISEL-NEXT: s_and_b32 s3, s2, 0xffff
+; GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GISEL-NEXT: s_and_b32 s0, s0, 0xffff
+; GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GISEL-NEXT: s_max_u32 s0, s0, s3
+; GISEL-NEXT: s_max_u32 s1, s1, s2
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
+; GCN-LABEL: add_max_v2u16_vsi:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, v0, s0, 4
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 4, i16 0>)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
+; GCN-LABEL: add_max_v2u16_svl:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_u16 v0, s0, v0, 0x650064
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> <i16 100, i16 101>)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
+; SDAG-LABEL: add_max_v2u16_slv:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: v_pk_add_max_u16 v0, 0x640064, s0, v0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: add_max_v2u16_slv:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_lshr_b32 s1, s0, 16
+; GISEL-NEXT: s_add_co_i32 s0, s0, 0x640064
+; GISEL-NEXT: s_addk_co_i32 s1, 0x64
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-NEXT: v_pk_max_u16 v0, s0, v0
+; GISEL-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, <i16 100, i16 100>
+  %max = call <2 x i16> @llvm.umax.v216(<2 x i16> %add, <2 x i16> %b)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_max_v2s16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_max_i16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.smax.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_min_v2u16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_min_u16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.umin.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
+; GCN-LABEL: add_min_v2s16_vvv:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_pk_add_min_i16 v0, v0, v1, v2
+; GCN-NEXT: ; return to shader part epilog
+  %add = add <2 x i16> %a, %b
+  %max = call <2 x i16> @llvm.smin.v216(<2 x i16> %add, <2 x i16> %c)
+  %ret = bitcast <2 x i16> %max to float
+  ret float %ret
+}
+
+declare <2 x i16> @llvm.smin.v216(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.smax.v216(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umin.v216(<2 x i16>, <2 x i16>)
+declare <2 x i16> @llvm.umax.v216(<2 x i16>, <2 x i16>)
+declare i32 @llvm.smin.i32(i32, i32)
+declare i32 @llvm.smax.i32(i32, i32)
+declare i32 @llvm.umin.i32(i32, i32)
+declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 5b4866c..6823a47 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX-942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GCN,GFX-950 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GFX1250 %s
 
 ; TODO: Add global-isel when it can support bf16
 
@@ -9,6 +10,11 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
 ; GCN: ; %bb.0:
 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GCN-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: ; return to shader part epilog
 %cvt = fpext bfloat %v to float
 ret float %cvt
 }
@@ -19,6 +25,13 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
 ; GCN-NEXT: s_lshl_b32 s0, s0, 16
 ; GCN-NEXT: v_mov_b32_e32 v0, s0
 ; GCN-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_lshl_b32 s0, s0, 16
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v0, s0
+; GFX1250-NEXT: ; return to shader part epilog
 %cvt = fpext bfloat %v to float
 ret float %cvt
 }
@@ -47,6 +60,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
 ; GFX-950: ; %bb.0:
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
 %res = fptrunc <2 x float> %src to <2 x bfloat>
 %cast = bitcast <2 x bfloat> %res to float
 ret float %cast
@@ -80,6 +98,11 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
 ; GFX-950-NEXT: v_mov_b32_e32 v0, s1
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, s0, v0
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, s0, s1
+; GFX1250-NEXT: ; return to shader part epilog
 %res = fptrunc <2 x float> %src to <2 x bfloat>
 %cast = bitcast <2 x bfloat> %res to float
 ret float %cast
@@ -103,6 +126,13 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX-950-NEXT: v_lshlrev_b32_e32 v0, 16, v0
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT: ; return to shader part epilog
 %trunc = fptrunc float %src to bfloat
 %ext = fpext bfloat %trunc to float
 ret float %ext
@@ -172,6 +202,38 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v8, v[2:3]
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v9, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[2:3]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, v[0:1], v[6:7]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cndmask_b32_e64 v2, -1, 1, s1
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[6:7]|
+; GFX1250-NEXT: v_dual_add_nc_u32 v1, v8, v2 :: v_dual_bitop2_b32 v10, 1, v8 bitop3:0x40
+; GFX1250-NEXT: s_wait_alu 0xf1ff
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11
+; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0
+; GFX1250-NEXT: s_wait_alu 0xfffe
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
 %res = fptrunc <2 x double> %src to <2 x bfloat>
 %cast = bitcast <2 x bfloat> %res to float
 ret float %cast
@@ -201,6 +263,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
 ; GFX-950: ; %bb.0: ; %entry
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
+; GFX1250-NEXT: ; return to shader part epilog
 entry:
 %a.cvt = fptrunc float %a to bfloat
 %b.cvt = fptrunc float %b to bfloat
@@ -236,6 +303,11 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
 ; GFX-950: ; %bb.0: ; %entry
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
 ; GFX-950-NEXT: ; return to shader part epilog
+;
+; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, |v1|
+; GFX1250-NEXT: ; return to shader part epilog
 entry:
 %a.neg = fneg float %a
 %a.cvt = fptrunc float %a.neg to bfloat
@@ -269,6 +341,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.cvt = fptrunc float %a to bfloat
 store bfloat %a.cvt, ptr %out
@@ -298,6 +377,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.abs = call float @llvm.fabs.f32(float %a)
 %a.cvt = fptrunc float %a.abs to bfloat
@@ -328,6 +414,13 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.neg = fneg float %a
 %a.cvt = fptrunc float %a.neg to bfloat
@@ -373,6 +466,24 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.cvt = fptrunc double %a to bfloat
 store bfloat %a.cvt, ptr %out
@@ -417,6 +528,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, -v[0:1], v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.neg = fneg double %a
 %a.cvt = fptrunc double %a.neg to bfloat
@@ -462,6 +592,25 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
 ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX-950-NEXT: flat_store_short v[2:3], v0
 ; GFX-950-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GFX1250-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, |v[4:5]|
+; GFX1250-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX1250-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
+; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT: flat_store_b16 v[2:3], v0
+; GFX1250-NEXT: s_endpgm
 entry:
 %a.abs = call double @llvm.fabs.f64(double %a)
 %a.cvt = fptrunc double %a.abs to bfloat
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index d103423..9550405 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -145,12 +145,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GCN: ; %bb.0: ; %bb
 ; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
 ; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
 ; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
 ; GCN-NEXT: s_wait_loadcnt 0x0
 ; GCN-NEXT: s_wait_xcnt 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
 ; GCN-NEXT: s_mov_b32 s0, exec_lo
 ; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 4cb0d2d..e6c38d2 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT: s_mov_b32 s14, s13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT: s_mov_b32 s5, s6
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
@@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: s_nop 0
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6
@@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT: s_mov_b32 s14, s13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT: s_mov_b32 s5, s6
 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll
new file mode 100644
index 0000000..85e7038
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.bf16.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefix=GCN %s
+
+define float @test_canonicalize_amdgcn_tanh_f32(float %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_f32_e32 v0, v0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+  %tanh = call float @llvm.amdgcn.tanh.f32(float %a)
+  %canonicalized = call float @llvm.canonicalize.f32(float %tanh)
+  ret float %canonicalized
+}
+
+define bfloat @test_canonicalize_amdgcn_tanh_bf16(bfloat %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_bf16_e32 v0, v0
+; GCN-NEXT: v_nop
+; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_max_num_f32_e32 v0, v0, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+  %tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat %a)
+  %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %tanh)
+  ret bfloat %canonicalized
+}
+
+define half @test_canonicalize_amdgcn_tanh_f16(half %a) {
+; GCN-LABEL: test_canonicalize_amdgcn_tanh_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_tanh_f16_e32 v0, v0
+; GCN-NEXT: s_set_pc_i64 s[30:31]
+  %tanh = call half @llvm.amdgcn.tanh.f16(half %a)
+  %canonicalized = call half @llvm.canonicalize.f16(half %tanh)
+  ret half %canonicalized
+}
diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
new file mode 100644
index 0000000..ea1ae04
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir
@@ -0,0 +1,18 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -passes=finalizebundle-test %s -o - | FileCheck %s
+
+---
+name: test_overlap
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: test_overlap
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: BUNDLE implicit-def $vgpr2_vgpr3, implicit-def $vgpr2, implicit-def $vgpr2_lo16, implicit-def $vgpr2_hi16, implicit-def $vgpr3, implicit-def $vgpr3_lo16, implicit-def $vgpr3_hi16, implicit-def $vgpr3_vgpr4, implicit-def $vgpr4, implicit-def $vgpr4_lo16, implicit-def $vgpr4_hi16, implicit $vgpr0_vgpr1, implicit $exec, implicit $vgpr1_vgpr2 {
+    ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_pseudo_e32 1, $vgpr0_vgpr1, implicit $exec
+    ; CHECK-NEXT: $vgpr3_vgpr4 = V_LSHLREV_B64_pseudo_e32 1, $vgpr1_vgpr2, implicit $exec
+    ; CHECK-NEXT: }
+    $vgpr2_vgpr3 = V_LSHLREV_B64_pseudo_e32 1, $vgpr0_vgpr1, implicit $exec
+    $vgpr3_vgpr4 = V_LSHLREV_B64_pseudo_e32 1, $vgpr1_vgpr2, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index f54fbba..e6018e4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -95,12 +95,24 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
 }
 
 define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
-; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295
 %load = load i8, ptr %gep0
 %zext = zext i8 %load to i32
@@ -551,12 +563,21 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 
 ; Both 64-bit base and 32-bit offset are scalar
 define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
 %load = load i8, ptr %gep0
@@ -567,12 +588,21 @@
 
 ; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
 define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:-24
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
 %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24
@@ -584,12 +614,21 @@
 
 ; Both components uniform, zext forced to LHS of addressing expression
 define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %sbase.as.int = ptrtoint ptr %sbase to i64
 %add = add i64 %zext.offset, %sbase.as.int
@@ -602,12 +641,21 @@
 
 ; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
 define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
 %zext.offset = zext i32 %soffset to i64
 %sbase.as.int = ptrtoint ptr %sbase to i64
 %add = add i64 %zext.offset, %sbase.as.int
@@ -686,33 +734,13 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
 
 ; Cannot push the shift into 32-bits, and cannot match.
 define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr %voffset.ptr
 %zext.offset = zext i32 %voffset to i64
 %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
@@ -743,8 +771,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt
 ; GFX1250: ; %bb.0:
 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT: ; return to shader part epilog
 %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
@@ -760,8 +787,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
 ; GFX1250: ; %bb.0:
 ; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset
 ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} @@ -774,33 +800,13 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg ; Range is 1 beyond the limit where we can move the shift into 32-bits. define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) { -; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] -; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{} %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index a98df5c..b0e6752 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -150,13 +150,11 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -321,15 +319,14 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 
0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -494,15 +491,14 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -664,17 +660,15 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX12-GISEL-LABEL: soff2_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -850,13 +844,11 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: 
s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1032,13 +1024,11 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1200,17 +1190,15 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX12-GISEL-LABEL: soff4_voff1: ; GFX12-GISEL: ; %bb.0: ; %bb ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1386,13 +1374,11 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 2, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; 
GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1565,13 +1551,11 @@ define amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX12-GISEL-NEXT: v_mul_u32_u24_e32 v0, 4, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v2, s0 offset:2 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v3, s0 offset:4 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: @@ -1672,9 +1656,7 @@ define amdgpu_kernel void @soff1_voff1_negative(i32 %soff) { ; GFX12-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_and_b32 v0, 0x3ff, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: scratch_store_b8 v0, v1, s0 offset:-1 scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 ; GFX12-GISEL-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll index 5d35adc..79907fd 100644 --- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll @@ -482,17 +482,16 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel: ; GCN-SDAG: ; %bb.0: ; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GCN-SDAG-NEXT: s_wait_xcnt 0x0 ; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8 ; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0 -; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_clause 0x1 -; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] -; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset +; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset ; GCN-SDAG-NEXT: s_wait_loadcnt 0x0 ; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7 ; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6 @@ -509,21 +508,20 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GCN-GISEL-NEXT: s_wait_xcnt 0x0 ; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2 ; 
GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4 -; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_clause 0x1 -; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] -; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] +; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset +; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset ; GCN-GISEL-NEXT: s_wait_loadcnt 0x0 ; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4 ; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 355f77a..af914bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll index a757bb0..b922854 100644 --- a/llvm/test/CodeGen/AMDGPU/max3.ll +++ b/llvm/test/CodeGen/AMDGPU/max3.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250,GFX9_1250 %s ; GCN-LABEL: {{^}}v_test_imax3_sgt_i32: ; GCN: v_max3_i32 @@ -46,7 +47,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i32(ptr addrspace(1) %out, ptr addrs ; VI: v_max_i16 ; VI: v_max_i16 -; GFX9: v_max3_i16 +; GFX9_1250: v_max3_i16 define amdgpu_kernel void @v_test_imax3_sgt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -70,7 +71,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i16(ptr addrspace(1) %out, ptr addrs ; VI: v_max_u16 ; VI: v_max_u16 -; GFX9: v_max3_u16 +; GFX9_1250: v_max3_u16 define amdgpu_kernel void @v_test_umax3_ugt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -94,7 +95,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i16(ptr addrspace(1) %out, ptr addrs ; VI: v_max_i16 ; VI: v_max_i16 -; GFX9: v_max3_i16 +; GFX9_1250: v_max3_i16 define amdgpu_kernel void @v_test_imax3_sgt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid 
@@ -118,7 +119,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i8(ptr addrspace(1) %out, ptr addrsp ; VI: v_max_u16 ; VI: v_max_u16 -; GFX9: v_max3_u16 +; GFX9_1250: v_max3_u16 define amdgpu_kernel void @v_test_umax3_ugt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid @@ -142,7 +143,7 @@ define amdgpu_kernel void @v_test_umax3_ugt_i8(ptr addrspace(1) %out, ptr addrsp ; VI: v_max_i16 ; VI: v_max_i16 -; GFX9: v_max3_i16 +; GFX9_1250: v_max3_i16 define amdgpu_kernel void @v_test_imax3_sgt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid @@ -166,7 +167,7 @@ define amdgpu_kernel void @v_test_imax3_sgt_i7(ptr addrspace(1) %out, ptr addrsp ; VI: v_max_u16 ; VI: v_max_u16 -; GFX9: v_max3_u16 +; GFX9_1250: v_max3_u16 define amdgpu_kernel void @v_test_umax3_ugt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid @@ -260,6 +261,50 @@ define amdgpu_kernel void @v_test_umax3_ugt_i64(ptr addrspace(1) %out, ptr addrs ret void } +; GCN-LABEL: {{^}}v_test_imax3_sgt_v2i16: +; SI-COUNT-2: v_max3_i32 +; VI-COUNT-2: v_max_i16 +; GFX9-COUNT-2: v_pk_max_i16 +; GFX1250: v_pk_max3_i16 +define amdgpu_kernel void @v_test_imax3_sgt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid + %a = load <2 x i16>, ptr addrspace(1) %gep0 + %b = load <2 x i16>, ptr addrspace(1) %gep1 + %c = load <2 x i16>, ptr addrspace(1) %gep2 + %icmp0 = icmp sgt <2 x i16> %a, %b + %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b + %icmp1 = icmp sgt <2 x i16> %i0, %c + %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c + store <2 x i16> %i1, ptr addrspace(1) %out + ret void +} + +; GCN-LABEL: {{^}}v_test_imax3_ugt_v2i16: +; SI-COUNT-2: v_max3_u32 +; VI-COUNT-2: v_max_u16 +; GFX9-COUNT-2: v_pk_max_u16 +; GFX1250: v_pk_max3_u16 +define amdgpu_kernel void @v_test_imax3_ugt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr <2 x i16>, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr <2 x i16>, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid + %a = load <2 x i16>, ptr addrspace(1) %gep0 + %b = load <2 x i16>, ptr addrspace(1) %gep1 + %c = load <2 x i16>, ptr addrspace(1) %gep2 + %icmp0 = icmp ugt <2 x i16> %a, %b + %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b + %icmp1 = icmp ugt <2 x i16> %i0, %c + %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c + store <2 x i16> %i1, ptr addrspace(1) %out + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir index 173c9cc..417a4c5 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -run-pass=si-memory-legalizer %s -o - | FileCheck %s --- | @@ -39,12 +40,7 @@ ... --- -# CHECK-LABEL: name: atomic_max_i32_noret -# CHECK-LABEL: bb.1.atomic: -# CHECK: BUFFER_ATOMIC_SMAX_ADDR64 -# CHECK-NEXT: S_WAITCNT_soft 3952 -# CHECK-NEXT: BUFFER_WBINVL1_VOL name: atomic_max_i32_noret alignment: 1 @@ -71,6 +67,46 @@ frameInfo: hasVAStart: false hasMustTailInVarArgFunc: false body: | + ; CHECK-LABEL: name: atomic_max_i32_noret + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $sgpr0_sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec + ; CHECK-NEXT: $vgpr1_vgpr2 = V_LSHL_B64_e64 $vgpr0_vgpr1, 3, implicit $exec + ; CHECK-NEXT: $sgpr7 = S_MOV_B32 61440 + ; CHECK-NEXT: $sgpr6 = S_MOV_B32 0 + ; CHECK-NEXT: S_WAITCNT 127 + ; CHECK-NEXT: $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 1, 0, implicit $exec :: (volatile load (s64) from %ir.tid.gep, addrspace 1) + ; CHECK-NEXT: S_WAITCNT_soft 3952 + ; CHECK-NEXT: $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; CHECK-NEXT: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; CHECK-NEXT: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK-NEXT: $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.atomic: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x0000000000000003 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; CHECK-NEXT: dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec + ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 0 + ; CHECK-NEXT: S_WAITCNT 127 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; CHECK-NEXT: S_WAITCNT 3952 + ; CHECK-NEXT: S_WAITCNT_soft 3952 + ; CHECK-NEXT: BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst (s32) from %ir.gep, addrspace 1) + ; CHECK-NEXT: S_WAITCNT_soft 3952 + ; CHECK-NEXT: BUFFER_WBINVL1_VOL implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.exit: + ; CHECK-NEXT: liveins: $sgpr2_sgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc + ; CHECK-NEXT: S_ENDPGM 0 bb.0 (%ir-block.0): successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000) liveins: $vgpr0, $sgpr0_sgpr1 diff --git 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir index e325071..064e3e0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir @@ -1,17 +1,65 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s --- -# GCN-LABEL: name: multiple_mem_operands -# GCN-LABEL: bb.3: -# GCN: S_WAITCNT_soft 3952 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT_soft 3952 -# GCN-NEXT: BUFFER_WBINVL1_VOL name: multiple_mem_operands body: | + ; GCN-LABEL: name: multiple_mem_operands + ; GCN: bb.0.entry: + ; GCN-NEXT: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) poison`, addrspace 5) + ; GCN-NEXT: S_WAITCNT 127 + ; GCN-NEXT: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GCN-NEXT: S_WAITCNT 3855 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) poison`, addrspace 5) + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: S_WAITCNT 3855 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; GCN-NEXT: S_WAITCNT 3855 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT 127 + ; GCN-NEXT: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, 
implicit-def dead $scc + ; GCN-NEXT: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, implicit $exec :: (load syncscope("agent-one-as") unordered (s32) from `ptr addrspace(1) poison`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst (s32) from `ptr addrspace(5) poison`, addrspace 5) + ; GCN-NEXT: S_WAITCNT_soft 3952 + ; GCN-NEXT: BUFFER_WBINVL1_VOL implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GCN-NEXT: S_WAITCNT 3952 + ; GCN-NEXT: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr addrspace(1) poison`, addrspace 1) + ; GCN-NEXT: S_ENDPGM 0 bb.0.entry: successors: %bb.1(0x30000000), %bb.2(0x50000000) liveins: $sgpr0_sgpr1, $sgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll index 0e25540..e30b929 100644 --- a/llvm/test/CodeGen/AMDGPU/min3.ll +++ b/llvm/test/CodeGen/AMDGPU/min3.ll @@ -1,6 +1,7 @@ ; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_1250 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250,GFX9_1250 %s ; GCN-LABEL: {{^}}v_test_imin3_slt_i32: ; GCN: v_min3_i32 @@ -116,7 +117,7 @@ define amdgpu_kernel void @v_test_umin3_2_uses(ptr addrspace(1) %out, ptr addrsp ; VI: v_min_i16 ; VI: v_min_i16 -; GFX9: v_min3_i16 +; GFX9_1250: v_min3_i16 define amdgpu_kernel void @v_test_imin3_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -140,7 +141,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i16(ptr addrspace(1) %out, ptr addrs ; VI: v_min_u16 ; VI: v_min_u16 -; GFX9: v_min3_u16 +; GFX9_1250: v_min3_u16 define amdgpu_kernel void @v_test_umin3_ult_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, ptr addrspace(1) %aptr, i32 %tid @@ -164,7 +165,7 @@ define amdgpu_kernel void @v_test_umin3_ult_i16(ptr addrspace(1) %out, ptr addrs ; VI: v_min_i16 ; VI: v_min_i16 -; GFX9: v_min3_i16 +; GFX9_1250: v_min3_i16 define amdgpu_kernel void @v_test_imin3_slt_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid @@ -188,7 +189,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i8(ptr addrspace(1) %out, ptr addrsp ; VI: v_min_u16 ; VI: v_min_u16 -; GFX9: v_min3_u16 +; GFX9_1250: v_min3_u16 define amdgpu_kernel void @v_test_umin3_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i8, ptr addrspace(1) %aptr, i32 %tid @@ -212,7 +213,7 @@ define 
amdgpu_kernel void @v_test_umin3_ult_i8(ptr addrspace(1) %out, ptr addrsp ; VI: v_min_i16 ; VI: v_min_i16 -; GFX9: v_min3_i16 +; GFX9_1250: v_min3_i16 define amdgpu_kernel void @v_test_imin3_slt_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid @@ -236,7 +237,7 @@ define amdgpu_kernel void @v_test_imin3_slt_i7(ptr addrspace(1) %out, ptr addrsp ; VI: v_min_u16 ; VI: v_min_u16 -; GFX9: v_min3_u16 +; GFX9_1250: v_min3_u16 define amdgpu_kernel void @v_test_umin3_ult_i7(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i7, ptr addrspace(1) %aptr, i32 %tid @@ -330,6 +331,50 @@ define amdgpu_kernel void @v_test_umin3_ult_i64(ptr addrspace(1) %out, ptr addrs ret void } +; GCN-LABEL: {{^}}v_test_imin3_slt_v2i16: +; SI-COUNT-2: v_min3_i32 +; VI-COUNT-2: v_min_i16 +; GFX9-COUNT-2: v_pk_min_i16 +; GFX1250: v_pk_min3_i16 +define amdgpu_kernel void @v_test_imin3_slt_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid + %a = load <2 x i16>, ptr addrspace(1) %gep0 + %b = load <2 x i16>, ptr addrspace(1) %gep1 + %c = load <2 x i16>, ptr addrspace(1) %gep2 + %icmp0 = icmp slt <2 x i16> %a, %b + %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b + %icmp1 = icmp slt <2 x i16> %i0, %c + %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c + store <2 x i16> %i1, ptr addrspace(1) %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_imin3_ult_v2i16: +; SI-COUNT-2: v_min3_u32 +; VI-COUNT-2: v_min_u16 +; GFX9-COUNT-2: v_pk_min_u16 +; GFX1250: v_pk_min3_u16 +define amdgpu_kernel void @v_test_imin3_ult_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep1 = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep2 = getelementptr i32, ptr addrspace(1) %cptr, i32 %tid + %outgep = getelementptr <2 x i16>, ptr addrspace(1) %out, i32 %tid + %a = load <2 x i16>, ptr addrspace(1) %gep0 + %b = load <2 x i16>, ptr addrspace(1) %gep1 + %c = load <2 x i16>, ptr addrspace(1) %gep2 + %icmp0 = icmp ult <2 x i16> %a, %b + %i0 = select <2 x i1> %icmp0, <2 x i16> %a, <2 x i16> %b + %icmp1 = icmp ult <2 x i16> %i0, %c + %i1 = select <2 x i1> %icmp1, <2 x i16> %i0, <2 x i16> %c + store <2 x i16> %i1, ptr addrspace(1) %outgep + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 5d0e4bf..8fe68ba 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def 
$vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff 
--git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll new file mode 100644 index 0000000..64392a1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll @@ -0,0 +1,436 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr %p, i32 %idx + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; SDAG-NEXT: flat_load_b32 v0, v[0:1] +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x 
float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd + %ret = load float, ptr %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16 +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd + %ld = load i8, ptr %arrayidx + %ret.i32 = zext i8 %ld to 
i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd + %ld = load i16, ptr %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom + %ret = load <2 x float>, ptr %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd + %ret = load <3 x float>, ptr %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) { +; GCN-LABEL: flat_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; 
GCN-NEXT: flat_load_b32 v0, v[0:1] +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom + %ret = load <4 x float>, ptr %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom + store float 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom + store i16 1, ptr %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom + store double 1.0, ptr %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) { +; GCN-LABEL: flat_atomicrmw_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom + atomicrmw add ptr %arrayidx, i32 1 monotonic + ret void +} + +define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) { +; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1] +; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; SDAG-NEXT: s_mov_b32 s0, exec_lo +; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; SDAG-NEXT: ; %bb.1: ; %Flow +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1 +; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; SDAG-NEXT: s_cbranch_execz .LBB21_2 +; SDAG-NEXT: .LBB21_4: ; 
%atomicrmw.private +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; SDAG-NEXT: s_wait_loadcnt 0x0 +; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; SDAG-NEXT: s_wait_xcnt 0x0 +; SDAG-NEXT: s_wait_alu 0xfffe +; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; SDAG-NEXT: s_branch .LBB21_5 +; SDAG-NEXT: .LBB21_5: +; +; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GISEL-NEXT: s_mov_b32 s2, exec_lo +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0 +; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5 +; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GISEL-NEXT: ; %bb.1: ; %Flow +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1 +; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2 +; GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GISEL-NEXT: s_wait_alu 0xfffd +; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GISEL-NEXT: s_wait_xcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GISEL-NEXT: s_branch .LBB21_5 +; GISEL-NEXT: .LBB21_5: +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom + %ret = atomicrmw add ptr %arrayidx, i64 1 monotonic + %ret.cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %ret.cast +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll new file mode 100644 index 0000000..faea84e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: 
global_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; SDAG-LABEL: global_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1] +; SDAG-NEXT: global_load_b32 v0, v[0:1], off +; SDAG-NEXT: s_wait_loadcnt 0x0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: global_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1] +; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = 
sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd + %ret = load float, ptr addrspace(1) %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd + %ld = load i8, ptr addrspace(1) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float 
@global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd + %ld = load i16, ptr addrspace(1) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd + %ret = load <3 x 
float>, ptr addrspace(1) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) { +; GCN-LABEL: global_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: global_load_b32 v0, v[0:1], off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom + store float 1.0, ptr addrspace(1) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom + store i16 1, ptr addrspace(1) %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom + store double 1.0, ptr addrspace(1) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: global_atomicrmw_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GCN-NEXT: s_endpgm +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom + atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic + ret void +} + +define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) { +; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1 +; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom + %ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic + %ret.cast = bitcast i64 %ret to <2 x float> + ret <2 x float> %ret.cast +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll new file mode 100644 index 0000000..27ecc83 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll @@ -0,0 +1,322 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) { +; GCN-LABEL: scratch_load_b32_alloca_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %p = alloca [32 x i32], align 4, addrspace(5) + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idx32: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i32 %idx + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> 
@scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b96_idxpromi_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b32 v0, v0, s0 offset:64 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxadd + %ret = load float, ptr addrspace(5) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u8 v0, v0, s0 offset:16 +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, 
ptr addrspace(5) %p, i64 %idxadd + %ld = load i8, ptr addrspace(5) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u16 v0, v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_u16 v0, v0, s0 offset:32 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxadd + %ld = load i16, ptr addrspace(5) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b64 v[0:1], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <2 x float> %ret +} + +; The scale_offset multiplication is unsigned, but the !range metadata proves the index is non-negative, so the sext in the tests below can still be matched.
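As an aside, here is a minimal standalone sketch (illustrative only, not part of the checked-in test file) of why that fold is legal: the !range metadata bounds the loaded index to [0, 1024), so its sign bit is known zero and sext produces the same 64-bit value as zext, which is all the unsigned scaled addressing mode needs.

; Illustrative IR under the stated assumption; !1 mirrors the file's !0 metadata.
define i64 @promote_nonneg_index(ptr addrspace(5) %pp) {
  %idx = load i32, ptr addrspace(5) %pp, align 4, !range !1
  ; For %idx in [0, 1024) the sign bit is zero, so this sext is
  ; interchangeable with zext and an unsigned scaled index is valid.
  %idxprom = sext i32 %idx to i64
  ret i64 %idxprom
}
!1 = !{i32 0, i32 1024}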
+ +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = sext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(5) %p, i64 %idxadd + %ret = load <3 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) { +; GCN-LABEL: scratch_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: scratch_load_b32 v0, v0, off +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: scratch_load_b128 v[0:3], v0, s0 scale_offset +; GCN-NEXT: s_wait_loadcnt 0x0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(5) %pp, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(5) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(5) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1.0 +; GCN-NEXT: scratch_store_b32 v0, v1, s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(5) %p, i64 %idxprom + store float 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b16_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: scratch_store_b16 v0, v1, s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(5) %p, i64 %idxprom + store i16 1, ptr addrspace(5) %arrayidx, align 2 + ret void +} + +define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) { +; GCN-LABEL: scratch_store_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0 +; GCN-NEXT: scratch_store_b64 v0, v[2:3], s0 scale_offset +; GCN-NEXT: s_endpgm +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds double, ptr addrspace(5) %p, i64 %idxprom + store double 1.0, ptr addrspace(5) %arrayidx, align 4 + ret void +} + +!0 = !{i32 0, i32 1024} diff 
--git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll new file mode 100644 index 0000000..b5bb68e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b32_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +; 'i32 %idx' is a signed index while SMRD soffset is unsigned, thus it is not selected. + +define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idx32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idx32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps 
float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b256_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b512_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd + %ld = load i8, ptr addrspace(4) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret 
float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b256_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: 
v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b512_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index a6b8ea3..6da7d1b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index f4b9523..af8b9e7 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -966,3 +966,45 @@ body: | $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec $sgpr0 = S_MOV_B32 0 ... + +# TODO: Unnecessary wait before overwriting vgpr0. 
+--- +name: overwrite_vgpr_after_smem +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1 + ; GCN-LABEL: name: overwrite_vgpr_after_smem + ; GCN: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... + +# TODO: Unnecessary wait before overwriting sgpr0. +--- +name: overwrite_sgpr_after_vmem +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1 + ; GCN-LABEL: name: overwrite_sgpr_after_vmem + ; GCN: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 +... |
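As a reference for the two TODOs above, a hedged sketch of the check lines one would expect once the pass stops inserting the redundant wait; it simply drops the S_WAIT_XCNT 0 that the TODOs call unnecessary, and is an expectation, not current codegen.

# Hypothetical future checks for overwrite_vgpr_after_smem (assumed, per the TODO):
#   GCN:      $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
#   GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
#   GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
# and likewise for overwrite_sgpr_after_vmem, with no S_WAIT_XCNT 0 before
# the final S_MOV_B32.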