diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
53 files changed, 16512 insertions, 6088 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 9b35920..fa4676e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3211,7 +3211,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -3303,7 +3303,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 @@ -4215,7 +4215,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -4300,7 +4300,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 @@ -4569,7 +4569,7 @@ define 
amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_ieee_mode = 1 ; GFX10-NEXT: enable_wgp_mode = 1 ; GFX10-NEXT: enable_mem_ordered = 1 -; GFX10-NEXT: enable_fwd_progress = 0 +; GFX10-NEXT: enable_fwd_progress = 1 ; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX10-NEXT: user_sgpr_count = 14 ; GFX10-NEXT: enable_trap_handler = 0 @@ -4657,7 +4657,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_ieee_mode = 1 ; GFX11-NEXT: enable_wgp_mode = 1 ; GFX11-NEXT: enable_mem_ordered = 1 -; GFX11-NEXT: enable_fwd_progress = 0 +; GFX11-NEXT: enable_fwd_progress = 1 ; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 ; GFX11-NEXT: user_sgpr_count = 13 ; GFX11-NEXT: enable_trap_handler = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir index 5b8c284..dde566d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s --- name: bswap_i32_vv @@ -19,6 +21,7 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec ; 
GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]] + ; ; GFX8-LABEL: name: bswap_i32_vv ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -26,6 +29,22 @@ body: | ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX9-LABEL: name: bswap_i32_vv + ; GFX9: liveins: $vgpr0 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] + ; + ; GFX10-LABEL: name: bswap_i32_vv + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 + ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_BSWAP %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir index 0a4cb3cc..fa95f33 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: 
llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s --- @@ -24,6 +24,24 @@ body: | ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] ; + ; GFX9-LABEL: name: fshr_s32 + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9-NEXT: {{ $}} + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; + ; GFX10-LABEL: name: fshr_s32 + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] + ; ; GFX11-LABEL: name: fshr_s32 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir index be3fe91..4f5f52b 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... +--- +name: memcpy_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memcpy_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMCPY %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir index a82ca30..0392aef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... 
+--- +name: memcpyinline_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memcpyinline_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMCPY_INLINE %2:_(p0), %5:_(p0), %7:_(s64) :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir index e7cfaab..1f8d1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir @@ -31,3 +31,33 @@ body: | S_ENDPGM 0 ... 
+--- +name: memmove_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + + ; CHECK-LABEL: name: memmove_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV1]](p0) :: (volatile load (s8)) + ; CHECK-NEXT: G_STORE [[LOAD]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s32) = COPY $vgpr3 + %5:_(p0) = G_MERGE_VALUES %3:_(s32), %4:_(s32) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMMOVE %2:_(p0), %5:_(p0), %7:_(s64), 0 :: (volatile store (s8)), (volatile load (s8)) + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir index 021cebb..dda94e15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir @@ -30,3 +30,32 @@ body: | S_ENDPGM 0 ... 
+--- +name: memset_test_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: memset_test_volatile + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY2]](s32) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s8) = COPY [[TRUNC]](s8) + ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[MV]](p0) :: (volatile store (s8)) + ; CHECK-NEXT: S_ENDPGM 0 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(p0) = G_MERGE_VALUES %0:_(s32), %1:_(s32) + %3:_(s32) = COPY $vgpr2 + %4:_(s16) = G_TRUNC %3:_(s32) + %5:_(s8) = G_TRUNC %4:_(s16) + %6:_(s32) = G_CONSTANT i32 1 + %7:_(s64) = G_ZEXT %6:_(s32) + G_MEMSET %2:_(p0), %5:_(s8), %7:_(s64), 0 :: (volatile store (s8)) + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir index cd69104..69e3561 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-smulh.mir @@ -80,8 +80,7 @@ body: | ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX8-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX8-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[ASHR]](s32) ; ; GFX9-LABEL: name: test_smulh_s16 ; GFX9: liveins: $vgpr0, $vgpr1 @@ -93,8 +92,7 @@ body: | ; GFX9-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG]], [[SEXT_INREG1]] ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[MUL]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG2:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: $vgpr0 = COPY [[SEXT_INREG2]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = COPY $vgpr1 %2:_(s16) = G_TRUNC %0 @@ -200,9 +198,7 @@ body: | ; GFX9-NEXT: [[SEXT_INREG3:%[0-9]+]]:_(s32) = G_SEXT_INREG [[UV3]], 16 ; GFX9-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[SEXT_INREG2]], [[SEXT_INREG3]] ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[MUL1]], [[C]](s32) - ; GFX9-NEXT: [[SEXT_INREG4:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 16 - ; GFX9-NEXT: [[SEXT_INREG5:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR1]], 16 - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[SEXT_INREG4]](s32), [[SEXT_INREG5]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ASHR]](s32), [[ASHR1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(<2 x s32>) = COPY $vgpr0_vgpr1 %1:_(<2 x s32>) = COPY $vgpr2_vgpr3 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir index 2c545c8..1025d60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-sbfx.mir @@ -92,8 +92,7 @@ body: | ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; GCN-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; GCN-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s32) - ; GCN-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[ASHR]], 20 - ; GCN-NEXT: $vgpr0 = COPY [[SEXT_INREG]](s32) + ; GCN-NEXT: $vgpr0 = COPY [[ASHR]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s32) = G_CONSTANT i32 16 %2:_(s32) = G_ASHR %0, %1(s32) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir new file mode 100644 index 0000000..beca901 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-whole-wave-functions.mir @@ -0,0 +1,40 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +--- +name: basic_test +legalized: true +machineFunctionInfo: + isWholeWaveFunction: true +body: | + bb.1: + liveins: $vgpr0, $vgpr1 + + ; CHECK-LABEL: name: basic_test + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK-NEXT: 
[[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:vcc(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[COPY2]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:vgpr(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[COPY3]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %1:_(s32) = COPY $vgpr0 + %2:_(s32) = COPY $vgpr1 + %0:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + %12:_(s32) = G_CONSTANT i32 5 + %11:_(s32) = G_SELECT %0(s1), %1, %12 + %14:_(s32) = G_CONSTANT i32 3 + %13:_(s32) = G_SELECT %0(s1), %2, %14 + %15:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), %11(s32), %13(s32), 1, 1, 1, 0 + $vgpr0 = COPY %15(s32) + G_AMDGPU_WHOLE_WAVE_FUNC_RETURN %0(s1), implicit $vgpr0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll index d4826a2..6044f6e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll @@ -7,7 +7,7 @@ ; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} ; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}} ; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}} -; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}} +; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xe00f0000{{$}} define amdgpu_cs half @cs_amdpal(half %arg0) #0 { %add = fadd half %arg0, 1.0 ret half %add diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll index 7ce5a00..d91b2117 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noalias-addrspace.ll @@ -514,9 +514,9 @@ define internal void @callee_no_alias_addr_space_select(ptr %ptr1, ptr %ptr2, pt ret void } -define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val, i32 %offset) #0 { +define internal void @callee_alias_addr_space_branch(ptr %ptr1, ptr %ptr2, ptr %ptr3, i1 %cond1, i1 %cond2, i32 %val) #0 { ; CHECK-LABEL: define internal void @callee_alias_addr_space_branch( -; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1]] { +; CHECK-SAME: ptr [[PTR1:%.*]], ptr [[PTR2:%.*]], ptr [[PTR3:%.*]], i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[VAL:%.*]]) #[[ATTR1]] { ; CHECK-NEXT: br i1 [[COND1]], label %[[BB_1_TRUE:.*]], label %[[BB_1_FALSE:.*]] ; CHECK: [[BB_1_TRUE]]: ; CHECK-NEXT: br label %[[BB_1_END:.*]] diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 029604c..b49614d 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ 
b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -2,6 +2,27 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s ; TODO: Add global-isel when it can support bf16 +define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_sqrt_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_sqrt_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %src) + store bfloat %sqrt, ptr addrspace(1) %out, align 2 + ret void +} define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { ; GCN-LABEL: llvm_log2_bf16_v: @@ -47,5 +68,6 @@ define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src ret void } +declare bfloat @llvm.sqrt.bf16(bfloat) declare bfloat @llvm.log2.bf16(bfloat) declare bfloat @llvm.exp2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index cd6d741..7859fcdf 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=amdgcn | FileCheck %s -check-prefixes=GCN ; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefixes=GFX7 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck %s -check-prefixes=GFX8 -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 | FileCheck %s -check-prefixes=GFX9,GFX900 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 | FileCheck %s -check-prefixes=GFX9,GFX950 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s 
-check-prefixes=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 @@ -967,12 +968,21 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v2bf16: ; GFX10: ; %bb.0: @@ -2019,23 +2029,41 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_store_global_v64bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 -; GFX9-NEXT: 
global_store_dwordx4 v[32:33], v[16:19], off offset:64 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 -; GFX9-NEXT: global_store_dwordx4 v[32:33], v[0:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_store_global_v64bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[12:15], off offset:48 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX900-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_store_global_v64bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[28:31], off offset:112 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[24:27], off offset:96 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[20:23], off offset:80 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[16:19], off offset:64 +; GFX950-NEXT: global_store_dwordx4 v[32:33], 
v[12:15], off offset:48 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[8:11], off offset:32 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[4:7], off offset:16 +; GFX950-NEXT: global_store_dwordx4 v[32:33], v[0:3], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_store_global_v64bf16: ; GFX10: ; %bb.0: @@ -2204,20 +2232,30 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dword v0, v[0:1], off +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; 
GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f32_to_bf16: ; GFX10: ; %bb.0: @@ -2308,30 +2346,50 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_load_store_f64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| -; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc -; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_load_store_f64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX900-NEXT: s_movk_i32 s8, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 +; GFX900-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| +; GFX900-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; GFX900-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX900-NEXT: 
s_or_b64 vcc, vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX900-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_add3_u32 v4, v5, v4, s8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc +; GFX900-NEXT: global_store_short_d16_hi v[2:3], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_load_store_f64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v6 +; GFX950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] +; GFX950-NEXT: v_add_u32_e32 v0, v6, v0 +; GFX950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_load_store_f64_to_bf16: ; GFX10: ; %bb.0: @@ -2858,12 +2916,21 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_short v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_short v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
test_arg_store: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_short v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store: ; GFX10: ; %bb.0: @@ -2918,12 +2985,21 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_arg_store_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[1:2], v0, off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_arg_store_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_store_dword v[1:2], v0, off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_arg_store_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: global_store_dword v[2:3], v0, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_arg_store_v2bf16: ; GFX10: ; %bb.0: @@ -3384,12 +3460,19 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_byval: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_byval: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
test_byval: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_byval: ; GFX10: ; %bb.0: @@ -3440,12 +3523,19 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_sret: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_sret: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_sret: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short v0, v1, off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_sret: ; GFX10: ; %bb.0: @@ -3907,34 +3997,63 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: 
s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry @@ -4104,34 +4223,63 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v2bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v2bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v2, s30, 0 +; GFX900-NEXT: v_writelane_b32 v2, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v2, 1 +; GFX900-NEXT: v_readlane_b32 s30, v2, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v2bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: 
s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v4, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v4, s30, 0 +; GFX950-NEXT: v_writelane_b32 v4, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dword v1, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v4, 1 +; GFX950-NEXT: v_readlane_b32 s30, v4, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v4, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry @@ -4308,36 +4456,68 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v3bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v3bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v3, 1 +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v3bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_short v4, v1, off offset:4 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dword v4, v0, off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry @@ -4534,36 +4714,66 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v4bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: 
s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v3, s30, 0 -; GFX9-NEXT: v_writelane_b32 v3, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v3, 1 -; GFX9-NEXT: v_readlane_b32 s30, v3, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v4bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v3, s30, 0 +; GFX900-NEXT: v_writelane_b32 v3, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: 
v_readlane_b32 s31, v3, 1 +; GFX900-NEXT: v_readlane_b32 s30, v3, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v4bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: v_mov_b32_e32 v4, v2 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry @@ -4804,40 +5014,69 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v8bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v5, s30, 0 -; GFX9-NEXT: v_writelane_b32 v5, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v5, 1 -; GFX9-NEXT: v_readlane_b32 s30, v5, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v8bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, 
test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v5, s30, 0 +; GFX900-NEXT: v_writelane_b32 v5, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v5, 1 +; GFX900-NEXT: v_readlane_b32 s30, v5, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v8bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v5, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v5, s30, 0 +; GFX950-NEXT: v_writelane_b32 v5, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v4, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v5, 1 +; 
GFX950-NEXT: v_readlane_b32 s30, v5, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v5, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry @@ -5174,48 +5413,79 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_call_v16bf16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s18, s33 -; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: s_xor_saveexec_b64 s[16:17], -1 -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[16:17] -; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: s_getpc_b64 s[16:17] -; GFX9-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 -; GFX9-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX9-NEXT: v_writelane_b32 v9, s30, 0 -; GFX9-NEXT: v_writelane_b32 v9, s31, 1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX9-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen 
offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readlane_b32 s31, v9, 1 -; GFX9-NEXT: v_readlane_b32 s30, v9, 0 -; GFX9-NEXT: s_mov_b32 s32, s33 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s33, s18 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_call_v16bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s18, s33 +; GFX900-NEXT: s_mov_b32 s33, s32 +; GFX900-NEXT: s_xor_saveexec_b64 s[16:17], -1 +; GFX900-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[16:17] +; GFX900-NEXT: s_addk_i32 s32, 0x400 +; GFX900-NEXT: s_getpc_b64 s[16:17] +; GFX900-NEXT: s_add_u32 s16, s16, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX900-NEXT: s_addc_u32 s17, s17, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX900-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 +; GFX900-NEXT: v_writelane_b32 v9, s30, 0 +; GFX900-NEXT: v_writelane_b32 v9, s31, 1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX900-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen offset:28 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen offset:24 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen offset:20 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v4, v8, s[0:3], 0 offen offset:16 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen offset:12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v2, v8, s[0:3], 0 offen offset:8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v1, v8, s[0:3], 0 offen 
offset:4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_readlane_b32 s31, v9, 1 +; GFX900-NEXT: v_readlane_b32 s30, v9, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_mov_b32 s33, s18 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_call_v16bf16: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_mov_b32 s2, s33 +; GFX950-NEXT: s_mov_b32 s33, s32 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_store_dword off, v9, s33 ; 4-byte Folded Spill +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_add_i32 s32, s32, 16 +; GFX950-NEXT: s_getpc_b64 s[0:1] +; GFX950-NEXT: s_add_u32 s0, s0, test_arg_store_v2bf16@gotpcrel32@lo+4 +; GFX950-NEXT: s_addc_u32 s1, s1, test_arg_store_v2bf16@gotpcrel32@hi+12 +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX950-NEXT: v_writelane_b32 v9, s30, 0 +; GFX950-NEXT: v_writelane_b32 v9, s31, 1 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX950-NEXT: scratch_store_dwordx4 v8, v[4:7], off offset:16 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_store_dwordx4 v8, v[0:3], off sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_readlane_b32 s31, v9, 1 +; GFX950-NEXT: v_readlane_b32 s30, v9, 0 +; GFX950-NEXT: s_mov_b32 s32, s33 +; GFX950-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GFX950-NEXT: scratch_load_dword v9, off, s33 ; 4-byte Folded Reload +; GFX950-NEXT: s_mov_b64 exec, s[0:1] +; GFX950-NEXT: s_mov_b32 s33, s2 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry @@ -5332,14 +5602,23 @@ define 
bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_alloca_load_store_ret: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], s32 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_alloca_load_store_ret: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_short v0, off, s[0:3], s32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_alloca_load_store_ret: +; GFX950: ; %bb.0: ; %entry +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_short off, v0, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: scratch_load_ushort v0, off, s32 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_alloca_load_store_ret: ; GFX10: ; %bb.0: ; %entry @@ -5625,52 +5904,72 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_overflow_stack: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 -; 
GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: s_waitcnt vmcnt(25) -; GFX9-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen 
offset:116 -; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: test_overflow_stack: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:8 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_store_dword v24, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: 
buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: s_waitcnt vmcnt(25) +; GFX900-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: test_overflow_stack: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_short v0, v1, off offset:128 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
test_overflow_stack: ; GFX10: ; %bb.0: @@ -5870,15 +6169,25 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[2:3], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v3bf16_to_v3f32: ; GFX10: ; %bb.0: @@ -6120,18 +6429,31 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v6bf16_to_v6f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx3 v[3:5], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 
0xffff0000, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v6bf16_to_v6f32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx3 v[3:5], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v6bf16_to_v6f32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx3 v[4:6], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v6bf16_to_v6f32: ; GFX10: ; %bb.0: @@ -6766,16 +7088,27 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v2bf16_to_v2f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v2bf16_to_v2f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: global_load_dword v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v2bf16_to_v2f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dword v0, v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v2bf16_to_v2f64: ; GFX10: ; %bb.0: @@ -6852,18 +7185,31 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v3bf16_to_v3f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v3bf16_to_v3f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_dwordx2 v[1:2], v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v3bf16_to_v3f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_extload_v3bf16_to_v3f64: ; GFX10: ; %bb.0: @@ -8476,193 +8822,363 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: global_extload_v32bf16_to_v32f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v9, v[1:2], off offset:62 -; GFX9-NEXT: global_load_ushort v11, v[1:2], off offset:60 -; GFX9-NEXT: global_load_ushort v12, v[1:2], off offset:58 -; GFX9-NEXT: global_load_ushort v13, v[1:2], off offset:56 -; GFX9-NEXT: global_load_ushort v14, v[1:2], off offset:54 -; GFX9-NEXT: global_load_ushort v15, v[1:2], off offset:52 -; GFX9-NEXT: global_load_ushort v16, v[1:2], off offset:50 -; GFX9-NEXT: global_load_ushort v17, v[1:2], off offset:48 -; GFX9-NEXT: global_load_ushort v18, v[1:2], off offset:46 -; GFX9-NEXT: global_load_ushort v19, v[1:2], off offset:44 -; GFX9-NEXT: global_load_ushort v20, v[1:2], off offset:42 -; GFX9-NEXT: global_load_ushort v21, v[1:2], off offset:40 -; GFX9-NEXT: global_load_ushort v22, v[1:2], off offset:38 -; GFX9-NEXT: global_load_ushort v23, v[1:2], off offset:36 -; GFX9-NEXT: global_load_ushort v24, v[1:2], off offset:34 -; GFX9-NEXT: global_load_ushort v25, v[1:2], off offset:32 -; GFX9-NEXT: global_load_ushort v26, v[1:2], off -; GFX9-NEXT: global_load_ushort v27, v[1:2], off offset:2 -; GFX9-NEXT: 
global_load_ushort v3, v[1:2], off offset:16 -; GFX9-NEXT: global_load_ushort v4, v[1:2], off offset:18 -; GFX9-NEXT: global_load_ushort v5, v[1:2], off offset:20 -; GFX9-NEXT: global_load_ushort v6, v[1:2], off offset:22 -; GFX9-NEXT: global_load_ushort v8, v[1:2], off offset:24 -; GFX9-NEXT: global_load_ushort v28, v[1:2], off offset:30 -; GFX9-NEXT: global_load_ushort v29, v[1:2], off offset:26 -; GFX9-NEXT: global_load_ushort v30, v[1:2], off offset:28 -; GFX9-NEXT: global_load_ushort v31, v[1:2], off offset:4 -; GFX9-NEXT: global_load_ushort v32, v[1:2], off offset:6 -; GFX9-NEXT: global_load_ushort v33, v[1:2], off offset:8 -; GFX9-NEXT: global_load_ushort v34, v[1:2], off offset:10 -; GFX9-NEXT: global_load_ushort v7, v[1:2], off offset:12 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v13 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; GFX9-NEXT: s_waitcnt vmcnt(29) -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; 
GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(31) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 -; GFX9-NEXT: s_waitcnt vmcnt(32) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v21 -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v23 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v22 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen 
offset:172 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 -; GFX9-NEXT: s_waitcnt vmcnt(44) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v25 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 -; GFX9-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; GFX9-NEXT: 
buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: s_waitcnt vmcnt(40) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 -; GFX9-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 -; GFX9-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: 
buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: global_extload_v32bf16_to_v32f64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: global_load_ushort v9, v[1:2], off offset:62 +; GFX900-NEXT: global_load_ushort v11, v[1:2], off offset:60 +; GFX900-NEXT: global_load_ushort v12, v[1:2], off offset:58 +; GFX900-NEXT: global_load_ushort v13, v[1:2], off offset:56 +; GFX900-NEXT: global_load_ushort v14, v[1:2], off offset:54 +; GFX900-NEXT: global_load_ushort v15, v[1:2], off offset:52 +; GFX900-NEXT: global_load_ushort v16, v[1:2], off offset:50 +; GFX900-NEXT: global_load_ushort v17, v[1:2], off offset:48 +; GFX900-NEXT: global_load_ushort v18, v[1:2], off offset:46 +; GFX900-NEXT: global_load_ushort v19, v[1:2], off offset:44 +; GFX900-NEXT: global_load_ushort v20, v[1:2], off offset:42 +; GFX900-NEXT: global_load_ushort v21, v[1:2], off offset:40 +; GFX900-NEXT: global_load_ushort v22, v[1:2], off offset:38 +; GFX900-NEXT: global_load_ushort v23, v[1:2], off offset:36 +; GFX900-NEXT: global_load_ushort v24, v[1:2], off offset:34 +; GFX900-NEXT: global_load_ushort v25, v[1:2], off offset:32 +; GFX900-NEXT: 
global_load_ushort v26, v[1:2], off +; GFX900-NEXT: global_load_ushort v27, v[1:2], off offset:2 +; GFX900-NEXT: global_load_ushort v3, v[1:2], off offset:16 +; GFX900-NEXT: global_load_ushort v4, v[1:2], off offset:18 +; GFX900-NEXT: global_load_ushort v5, v[1:2], off offset:20 +; GFX900-NEXT: global_load_ushort v6, v[1:2], off offset:22 +; GFX900-NEXT: global_load_ushort v8, v[1:2], off offset:24 +; GFX900-NEXT: global_load_ushort v28, v[1:2], off offset:30 +; GFX900-NEXT: global_load_ushort v29, v[1:2], off offset:26 +; GFX900-NEXT: global_load_ushort v30, v[1:2], off offset:28 +; GFX900-NEXT: global_load_ushort v31, v[1:2], off offset:4 +; GFX900-NEXT: global_load_ushort v32, v[1:2], off offset:6 +; GFX900-NEXT: global_load_ushort v33, v[1:2], off offset:8 +; GFX900-NEXT: global_load_ushort v34, v[1:2], off offset:10 +; GFX900-NEXT: global_load_ushort v7, v[1:2], off offset:12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: global_load_ushort v1, v[1:2], off offset:14 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v11 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v13 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:252 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:248 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v12 +; GFX900-NEXT: s_waitcnt vmcnt(29) +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:244 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:240 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:236 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:232 
+; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v12 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v16 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v17 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:228 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:224 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(31) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v18 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v14 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:220 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:216 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v15 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v16 +; GFX900-NEXT: s_waitcnt vmcnt(32) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v19 +; GFX900-NEXT: s_waitcnt vmcnt(30) +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v21 +; GFX900-NEXT: s_waitcnt vmcnt(28) +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v23 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:212 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:208 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:204 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:200 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:196 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:192 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v21 +; GFX900-NEXT: s_waitcnt vmcnt(33) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v17 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v19 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v20 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:188 +; GFX900-NEXT: 
buffer_store_dword v15, v0, s[0:3], 0 offen offset:184 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:180 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:176 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:172 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:168 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:164 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:160 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:156 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:152 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:148 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v2 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen offset:144 +; GFX900-NEXT: s_waitcnt vmcnt(44) +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:140 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:136 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v11 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:132 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:128 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v13 +; GFX900-NEXT: s_waitcnt vmcnt(38) +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:124 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:120 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:116 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:112 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[9:10], v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[11:12], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; 
GFX900-NEXT: v_cvt_f64_f32_e32 v[13:14], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[15:16], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; GFX900-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:108 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:104 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[17:18], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[19:20], v2 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[21:22], v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:100 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:96 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[5:6], v2 +; GFX900-NEXT: s_waitcnt vmcnt(41) +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; GFX900-NEXT: s_waitcnt vmcnt(40) +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:92 +; GFX900-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:88 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:84 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:80 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:76 +; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 +; GFX900-NEXT: v_cvt_f64_f32_e32 v[1:2], v10 +; GFX900-NEXT: buffer_store_dword v7, 
v0, s[0:3], 0 offen offset:68 +; GFX900-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:64 +; GFX900-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:60 +; GFX900-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:56 +; GFX900-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:52 +; GFX900-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 +; GFX900-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen offset:44 +; GFX900-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen offset:40 +; GFX900-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen offset:36 +; GFX900-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:32 +; GFX900-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:28 +; GFX900-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:24 +; GFX900-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen offset:20 +; GFX900-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen offset:16 +; GFX900-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen offset:12 +; GFX900-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen offset:8 +; GFX900-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen offset:4 +; GFX900-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: global_extload_v32bf16_to_v32f64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: global_load_ushort v1, v[2:3], 
off offset:2 +; GFX950-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; GFX950-NEXT: global_load_ushort v5, v[2:3], off offset:8 +; GFX950-NEXT: global_load_ushort v6, v[2:3], off offset:4 +; GFX950-NEXT: global_load_ushort v7, v[2:3], off +; GFX950-NEXT: global_load_ushort v8, v[2:3], off offset:6 +; GFX950-NEXT: global_load_ushort v9, v[2:3], off offset:10 +; GFX950-NEXT: global_load_ushort v10, v[2:3], off offset:14 +; GFX950-NEXT: global_load_ushort v11, v[2:3], off offset:18 +; GFX950-NEXT: global_load_ushort v12, v[2:3], off offset:28 +; GFX950-NEXT: global_load_ushort v13, v[2:3], off offset:24 +; GFX950-NEXT: global_load_ushort v14, v[2:3], off offset:20 +; GFX950-NEXT: global_load_ushort v15, v[2:3], off offset:16 +; GFX950-NEXT: global_load_ushort v16, v[2:3], off offset:22 +; GFX950-NEXT: global_load_ushort v17, v[2:3], off offset:26 +; GFX950-NEXT: global_load_ushort v18, v[2:3], off offset:30 +; GFX950-NEXT: global_load_ushort v19, v[2:3], off offset:34 +; GFX950-NEXT: global_load_ushort v20, v[2:3], off offset:44 +; GFX950-NEXT: global_load_ushort v21, v[2:3], off offset:40 +; GFX950-NEXT: global_load_ushort v22, v[2:3], off offset:36 +; GFX950-NEXT: global_load_ushort v23, v[2:3], off offset:32 +; GFX950-NEXT: global_load_ushort v24, v[2:3], off offset:38 +; GFX950-NEXT: global_load_ushort v25, v[2:3], off offset:42 +; GFX950-NEXT: global_load_ushort v26, v[2:3], off offset:46 +; GFX950-NEXT: global_load_ushort v42, v[2:3], off offset:50 +; GFX950-NEXT: global_load_ushort v43, v[2:3], off offset:62 +; GFX950-NEXT: global_load_ushort v46, v[2:3], off offset:60 +; GFX950-NEXT: global_load_ushort v47, v[2:3], off offset:56 +; GFX950-NEXT: global_load_ushort v60, v[2:3], off offset:52 +; GFX950-NEXT: global_load_ushort v56, v[2:3], off offset:48 +; GFX950-NEXT: global_load_ushort v57, v[2:3], off offset:54 +; GFX950-NEXT: global_load_ushort v58, v[2:3], off offset:58 +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: 
v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(31) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: s_waitcnt vmcnt(30) +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v4 +; GFX950-NEXT: s_waitcnt vmcnt(29) +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 +; GFX950-NEXT: s_waitcnt vmcnt(27) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; GFX950-NEXT: s_waitcnt vmcnt(26) +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v8 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v9 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v10 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v11 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v12 +; GFX950-NEXT: s_waitcnt vmcnt(21) +; GFX950-NEXT: v_lshlrev_b32_e32 v36, 16, v13 +; GFX950-NEXT: s_waitcnt vmcnt(20) +; GFX950-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX950-NEXT: s_waitcnt vmcnt(19) +; GFX950-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX950-NEXT: s_waitcnt vmcnt(18) +; GFX950-NEXT: v_lshlrev_b32_e32 v33, 16, v16 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[12:13], v27 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshlrev_b32_e32 v37, 16, v18 +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_lshlrev_b32_e32 v39, 16, v19 +; GFX950-NEXT: s_waitcnt vmcnt(14) +; GFX950-NEXT: v_lshlrev_b32_e32 v44, 16, v20 +; GFX950-NEXT: s_waitcnt vmcnt(13) +; GFX950-NEXT: v_lshlrev_b32_e32 v40, 16, v21 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[14:15], v30 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[20:21], v31 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshlrev_b32_e32 v49, 16, v24 +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_lshlrev_b32_e32 v53, 16, v25 +; GFX950-NEXT: 
s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v26 +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v43 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[18:19], v32 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[24:25], v33 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[26:27], v36 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[32:33], v37 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[36:37], v39 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[38:39], v44 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v42 +; GFX950-NEXT: s_waitcnt vmcnt(5) +; GFX950-NEXT: v_lshlrev_b32_e32 v42, 16, v46 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v42 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v58 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:240 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v46, 16, v47 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[44:45], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[42:43], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v57 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v46 +; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v48, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v52, 16, v22 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:224 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[10:11], v28 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[58:59], v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[16:17], v29 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[22:23], v34 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[28:29], v35 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[50:51], v49 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[48:49], v52 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[54:55], v53 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[52:53], v40 
+; GFX950-NEXT: v_cvt_f64_f32_e32 v[40:41], v41 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[56:57], v1 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[8:9], v7 +; GFX950-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[56:59], off offset:208 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[42:45], off offset:192 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[38:41], off offset:176 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[52:55], off offset:160 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[48:51], off offset:144 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[34:37], off offset:128 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16 +; GFX950-NEXT: scratch_store_dwordx4 v0, v[2:5], off +; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
global_extload_v32bf16_to_v32f64: ; GFX10: ; %bb.0: @@ -9050,20 +9566,29 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16: ; GFX10: ; %bb.0: @@ -9178,29 +9703,41 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 
0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v2bf16: ; GFX10: ; %bb.0: @@ -9363,38 +9900,54 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v3bf16: ; GFX10: ; %bb.0: @@ -9604,46 +10157,65 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 
v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; 
GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v4bf16: ; GFX10: ; %bb.0: @@ -9967,80 +10539,113 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: 
v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 
0x7fff +; GFX900-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: 
v_add_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v5, 
v9, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v8bf16: ; GFX10: ; %bb.0: @@ -10656,148 +11261,209 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, 
v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; 
GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_add_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_add_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v16bf16: ; GFX10: ; %bb.0: @@ -12112,286 +12778,407 @@ 
define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_add_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_add_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 
0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_add_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_add_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: 
v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_add_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_add_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_add_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; 
GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_add_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_add_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; 
GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_add_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v2, v2, v18 -; 
GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_add_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: 
v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_add_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_add_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_add_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_add_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_add_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_add_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 
0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_add_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_add_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_add_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_add_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_add_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_add_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: 
v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_add_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 
v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_add_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_add_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_add_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_add_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_add_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_add_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_add_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_add_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_add_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_add_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_add_f32_e32 
v27, v50, v49 +; GFX950-NEXT: v_add_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_add_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_add_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_add_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_add_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_add_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_add_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_add_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_add_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_add_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_add_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 
+; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_add_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_add_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_add_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_v32bf16: ; GFX10: ; %bb.0: @@ -13290,19 +14077,27 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_0: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_0: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fadd_bf16_fpimm_0: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_0: ; GFX10: ; %bb.0: @@ -13386,19 +14181,27 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fadd_bf16_fpimm_1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fadd_bf16_fpimm_1: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; 
+; GFX950-LABEL: v_fadd_bf16_fpimm_1: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fadd_bf16_fpimm_1: ; GFX10: ; %bb.0: @@ -13487,20 +14290,29 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_bf16: ; GFX10: ; 
%bb.0: @@ -13615,29 +14427,41 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: 
s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v2bf16: ; GFX10: ; %bb.0: @@ -13800,38 +14624,54 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; 
GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, 
v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v3bf16: ; GFX10: ; %bb.0: @@ -14041,46 +14881,65 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fsub_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: 
s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fsub_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fsub_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 
v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_sub_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_sub_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fsub_v4bf16: ; GFX10: ; %bb.0: @@ -14249,20 +15108,29 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_bf16: ; GFX10: ; %bb.0: @@ -14377,29 +15245,41 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v2bf16: ; GFX10: ; %bb.0: @@ -14562,38 +15442,54 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, 
v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 
0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v3bf16: ; GFX10: ; %bb.0: @@ -14803,46 +15699,65 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, 
v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX950-LABEL: v_fmul_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v4bf16: ; GFX10: ; %bb.0: @@ -15166,80 +16081,113 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; 
GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; 
GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v5 
+; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v7, v9, v7 +; GFX950-NEXT: 
v_mul_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v8bf16: ; GFX10: ; %bb.0: @@ -15855,148 +16803,209 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15 -; 
GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; 
GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: 
v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 
0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 
0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 
0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: 
v_mul_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v16bf16: ; GFX10: ; %bb.0: @@ -17311,286 +18320,407 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmul_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 
vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; 
GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 
v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, 
v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; 
GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; 
GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmul_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_mul_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; 
GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_mul_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_mul_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_mul_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_mul_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 
16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_mul_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_mul_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_mul_f32_e32 v26, v33, v26 +; 
GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_mul_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_mul_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_mul_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_mul_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_mul_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; 
GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_mul_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_mul_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; 
GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; 
GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmul_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_mul_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_mul_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; 
GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_mul_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_mul_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_mul_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_mul_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_mul_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_mul_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_mul_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_mul_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_mul_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_mul_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_mul_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_mul_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_mul_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_mul_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_mul_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_mul_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_mul_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_mul_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_mul_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 
0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_mul_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_mul_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_mul_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_mul_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_mul_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_mul_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_mul_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmul_v32bf16: ; GFX10: ; %bb.0: @@ -18524,30 +19654,50 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fdiv_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 -; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_rcp_f32_e32 v4, v2 -; GFX9-NEXT: v_fma_f32 v5, -v2, v4, 1.0 -; GFX9-NEXT: v_fma_f32 v4, v5, v4, v4 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v4 -; GFX9-NEXT: v_fma_f32 v6, -v2, v5, v3 -; GFX9-NEXT: v_fma_f32 v5, v6, v4, v5 -; GFX9-NEXT: v_fma_f32 v2, -v2, v5, v3 -; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5 -; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fdiv_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0 +; GFX900-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_rcp_f32_e32 v4, v2 +; GFX900-NEXT: v_fma_f32 v5, -v2, v4, 1.0 +; GFX900-NEXT: v_fma_f32 v4, v5, v4, v4 +; GFX900-NEXT: v_mul_f32_e32 v5, v3, v4 +; GFX900-NEXT: v_fma_f32 v6, -v2, v5, v3 +; GFX900-NEXT: v_fma_f32 v5, v6, v4, v5 +; GFX900-NEXT: v_fma_f32 v2, -v2, v5, v3 +; GFX900-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; GFX900-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fdiv_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0 +; GFX950-NEXT: v_rcp_f32_e32 v3, v2 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0 +; GFX950-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX950-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX950-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX950-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX950-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fdiv_bf16: ; GFX10: ; %bb.0: @@ -18996,20 +20146,29 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: 
s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_bf16: ; GFX10: ; %bb.0: @@ -19124,29 +20283,41 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, 
v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v2bf16: ; GFX10: ; %bb.0: @@ -19309,38 +20480,54 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v3bf16: ; GFX10: ; %bb.0: @@ -19550,46 +20737,65 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 
-; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; 
GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v4bf16: ; GFX10: ; %bb.0: @@ -19913,80 +21119,113 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; 
GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v6, v9, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, 
v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v6, v9, v6 +; 
GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v8bf16: ; GFX10: ; %bb.0: @@ -20602,148 +21841,209 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc 
-; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, 
v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 
0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, v12, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, 
vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_min_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_min_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v14, v17, v14 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_min_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; 
GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v16bf16: ; GFX10: ; %bb.0: @@ -22058,286 +23358,407 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_minnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_min_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_min_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; 
GFX9-NEXT: v_min_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_min_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_min_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_min_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: 
v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_min_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10 -; GFX9-NEXT: v_min_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_min_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_min_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_min_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_min_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; 
GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_min_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_min_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_min_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 -; 
GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_min_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_min_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_min_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_minnum_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_min_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: 
v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_min_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_min_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_min_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_min_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; 
GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_min_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_min_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: 
v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_min_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_min_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_min_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: 
v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_min_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_min_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_min_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: 
v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_min_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_min_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: 
v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_min_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_min_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: 
v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_minnum_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 
v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_min_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_min_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_min_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_min_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_min_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_min_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_min_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX950-NEXT: v_min_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_min_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_min_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_min_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_min_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; 
GFX950-NEXT: v_min_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_min_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_min_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_min_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_min_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_min_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_min_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minnum_v32bf16: ; GFX10: ; %bb.0: @@ -23250,20 +24671,29 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_bf16: ; GFX10: ; %bb.0: @@ -23378,29 +24808,41 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v2, v3, v2 -; GFX9-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v2, v3, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: 
v_max_f32_e32 v2, v3, v2 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v2bf16: ; GFX10: ; %bb.0: @@ -23563,38 +25005,54 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: 
s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v3bf16: ; GFX10: ; %bb.0: @@ -23804,46 +25262,65 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v4, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v4bf16: -; GFX9: ; 
%bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v4, v5, v4 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v4, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; 
GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v4, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v4, v5, v4 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_max_f32_e32 
v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v4bf16: ; GFX10: ; %bb.0: @@ -24167,80 +25644,113 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v3, v3, v8, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v8, v9, v8 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 -; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v7, v9, v7 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v6, v9, 
v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 -; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v5, v9, v5 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 -; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v5, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v8, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX900-NEXT: v_add3_u32 v9, v9, v8, s4 +; GFX900-NEXT: 
v_or_b32_e32 v10, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX900-NEXT: v_add3_u32 v9, v9, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX900-NEXT: v_add3_u32 v9, v9, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v8, v9, v8 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v7, v9, v7 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v6, v9, v6 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v5, v9, v5 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; 
GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v8bf16: ; GFX10: ; %bb.0: @@ -24856,148 +26366,209 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX8-NEXT: v_alignbit_b32 v7, v7, v16, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v16, v17, v16 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_max_f32_e32 v7, v7, v15 -; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 -; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v15, v17, v15 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v14 -; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 
v14, 16, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v14, v17, v14 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v13 -; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v13, v17, v13 -; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v12 -; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v12, v17, v12 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v11 -; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, 
v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v11, v17, v11 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v10 -; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v10, v17, v10 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v9 -; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v9, v17, v9 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v8 -; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1 -; GFX9-NEXT: 
v_cndmask_b32_e32 v9, v17, v18, vcc -; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v9, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v11, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v12, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v13, s4 -; GFX9-NEXT: v_perm_b32 v5, v5, v14, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v16, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v17, v16, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX900-NEXT: v_add3_u32 v17, v17, v16, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v16 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX900-NEXT: v_bfe_u32 v15, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v15, v15, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX900-NEXT: v_add3_u32 v17, v17, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_bfe_u32 v14, v6, 
16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v14, v14, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v14, v17, v14 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v17, v14, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX900-NEXT: v_add3_u32 v17, v17, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_bfe_u32 v13, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v13, v13, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v4 +; GFX900-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v17, v13, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX900-NEXT: v_add3_u32 v17, v17, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_bfe_u32 v12, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v12, v12, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v17, 
v12, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX900-NEXT: v_add3_u32 v17, v17, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v11, v11, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v17, v11, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX900-NEXT: v_add3_u32 v17, v17, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v10, v10, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v17, v10, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX900-NEXT: v_add3_u32 v17, v17, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v9, v9, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v8 +; 
GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v17, v9, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX900-NEXT: v_add3_u32 v17, v17, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc +; GFX900-NEXT: v_add3_u32 v8, v8, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v16, v17, v16 +; GFX950-NEXT: v_max_f32_e32 v7, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_max_f32_e32 v15, v17, v15 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v14, 
v17, v14 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_max_f32_e32 v13, v17, v13 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v12, v17, v12 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v11, v17, v11 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v10, v17, v10 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v9, v17, v9 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v15 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v16 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v16bf16: ; GFX10: ; %bb.0: @@ -26312,286 +27883,407 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; 
GFX8-NEXT: v_alignbit_b32 v15, v16, v15, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_maxnum_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14 -; GFX9-NEXT: v_max_f32_e32 v31, v32, v31 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 -; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v14, v14, v30 -; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 -; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13 -; GFX9-NEXT: v_max_f32_e32 v30, v32, v30 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v13, v13, v29 -; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 -; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc -; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4 -; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12 -; GFX9-NEXT: v_max_f32_e32 v32, v32, v29 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 -; 
GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 -; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1 -; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX9-NEXT: v_max_f32_e32 v33, v33, v34 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 -; GFX9-NEXT: v_max_f32_e32 v29, v15, v29 -; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 -; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc -; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc -; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11 -; GFX9-NEXT: v_max_f32_e32 v28, v33, v28 -; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 -; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 -; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 -; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 
16, v10 -; GFX9-NEXT: v_max_f32_e32 v27, v33, v27 -; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 -; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 -; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 -; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9 -; GFX9-NEXT: v_max_f32_e32 v26, v33, v26 -; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 -; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 -; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 -; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8 -; GFX9-NEXT: v_max_f32_e32 v25, v33, v25 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 -; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 -; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 -; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v8, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7 -; GFX9-NEXT: v_max_f32_e32 v24, v33, v24 -; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 -; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 -; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6 -; GFX9-NEXT: v_max_f32_e32 v23, v33, v23 -; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 -; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 -; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5 -; GFX9-NEXT: v_max_f32_e32 v22, v33, v22 -; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 -; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 -; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, 
v34, vcc -; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: v_max_f32_e32 v21, v33, v21 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 -; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 -; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3 -; GFX9-NEXT: v_max_f32_e32 v20, v33, v20 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 -; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 -; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: v_max_f32_e32 v19, v33, v19 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v18 -; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, 
v19 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 -; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1 -; GFX9-NEXT: v_max_f32_e32 v18, v33, v18 -; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 -; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 -; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v17, v33, v17 -; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 -; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4 -; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 -; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc -; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v17, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v18, s4 -; GFX9-NEXT: v_perm_b32 v2, v2, v19, s4 -; GFX9-NEXT: v_perm_b32 v3, v3, v20, s4 -; GFX9-NEXT: v_perm_b32 v4, v4, v21, s4 -; GFX9-NEXT: 
v_perm_b32 v5, v5, v22, s4 -; GFX9-NEXT: v_perm_b32 v6, v6, v23, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v24, s4 -; GFX9-NEXT: v_perm_b32 v8, v8, v25, s4 -; GFX9-NEXT: v_perm_b32 v9, v9, v26, s4 -; GFX9-NEXT: v_perm_b32 v10, v10, v27, s4 -; GFX9-NEXT: v_perm_b32 v11, v11, v28, s4 -; GFX9-NEXT: v_perm_b32 v12, v12, v32, s4 -; GFX9-NEXT: v_perm_b32 v13, v13, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v14, v31, s4 -; GFX9-NEXT: v_perm_b32 v15, v29, v15, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_maxnum_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX900-NEXT: v_max_f32_e32 v31, v32, v31 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v32, v31, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX900-NEXT: v_add3_u32 v32, v32, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: v_bfe_u32 v30, v14, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v30, v30, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX900-NEXT: v_max_f32_e32 v30, v32, v30 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v30 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_bfe_u32 v29, v13, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc +; GFX900-NEXT: v_add3_u32 v29, v29, v13, s4 +; 
GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX900-NEXT: v_max_f32_e32 v32, v32, v29 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX900-NEXT: v_bfe_u32 v28, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v28, v28, v12, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v29 +; GFX900-NEXT: v_max_f32_e32 v33, v33, v34 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_max_f32_e32 v29, v15, v29 +; GFX900-NEXT: v_bfe_u32 v15, v33, 16, 1 +; GFX900-NEXT: v_add3_u32 v15, v15, v33, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v33 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX900-NEXT: v_bfe_u32 v33, v29, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc +; GFX900-NEXT: v_add3_u32 v33, v33, v29, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v29 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc +; GFX900-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v33, v33, v32, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v32 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v11 +; GFX900-NEXT: v_max_f32_e32 v28, v33, v28 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: 
v_bfe_u32 v33, v28, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX900-NEXT: v_add3_u32 v33, v33, v28, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v28 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_bfe_u32 v27, v11, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v27, v27, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v10 +; GFX900-NEXT: v_max_f32_e32 v27, v33, v27 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_bfe_u32 v33, v27, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX900-NEXT: v_add3_u32 v33, v33, v27, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v27 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_bfe_u32 v26, v10, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v26, v26, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v9 +; GFX900-NEXT: v_max_f32_e32 v26, v33, v26 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX900-NEXT: v_add3_u32 v33, v33, v26, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v26 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_bfe_u32 v25, v9, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v25, v25, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc +; 
GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v8 +; GFX900-NEXT: v_max_f32_e32 v25, v33, v25 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_bfe_u32 v33, v25, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX900-NEXT: v_add3_u32 v33, v33, v25, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v25 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX900-NEXT: v_bfe_u32 v24, v8, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v24, v24, v8, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v7 +; GFX900-NEXT: v_max_f32_e32 v24, v33, v24 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_bfe_u32 v33, v24, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX900-NEXT: v_add3_u32 v33, v33, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_bfe_u32 v23, v7, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v23, v23, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v6 +; GFX900-NEXT: v_max_f32_e32 v23, v33, v23 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_bfe_u32 v33, v23, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX900-NEXT: v_add3_u32 v33, v33, v23, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v23 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_bfe_u32 v22, v6, 16, 1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v23, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v22, v22, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; GFX900-NEXT: v_max_f32_e32 v22, v33, v22 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_bfe_u32 v33, v22, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX900-NEXT: v_add3_u32 v33, v33, v22, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v22 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_bfe_u32 v21, v5, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v21, v21, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v4 +; GFX900-NEXT: v_max_f32_e32 v21, v33, v21 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_bfe_u32 v33, v21, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX900-NEXT: v_add3_u32 v33, v33, v21, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v21 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v20, v20, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v3 +; GFX900-NEXT: v_max_f32_e32 v20, v33, v20 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v33, v20, 16, 1 +; 
GFX900-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX900-NEXT: v_add3_u32 v33, v33, v20, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v20 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_bfe_u32 v19, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v19, v19, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v2 +; GFX900-NEXT: v_max_f32_e32 v19, v33, v19 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX900-NEXT: v_add3_u32 v33, v33, v19, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v19 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_bfe_u32 v18, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v18, v18, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; GFX900-NEXT: v_max_f32_e32 v18, v33, v18 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v33, v18, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX900-NEXT: v_add3_u32 v33, v33, v18, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v18 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_bfe_u32 v17, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v17, v17, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v16 +; 
GFX900-NEXT: v_lshlrev_b32_e32 v33, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v17, v33, v17 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v33, v17, 16, 1 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX900-NEXT: v_add3_u32 v33, v33, v17, s4 +; GFX900-NEXT: v_or_b32_e32 v34, 0x400000, v17 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v33, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v32, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v29, v15, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_maxnum_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v54, 0xffff0000, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; 
GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v7 +; GFX950-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v35, 0xffff0000, v29 +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v28 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v27 +; GFX950-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v49, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v50, 0xffff0000, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v52, 0xffff0000, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_max_f32_e32 v33, v34, v33 +; GFX950-NEXT: v_max_f32_e32 v14, v14, v30 +; GFX950-NEXT: v_max_f32_e32 v30, v36, v35 +; GFX950-NEXT: v_max_f32_e32 v13, v13, v29 +; GFX950-NEXT: v_max_f32_e32 v29, v38, v37 +; GFX950-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX950-NEXT: v_max_f32_e32 v28, v48, v39 +; GFX950-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX950-NEXT: v_max_f32_e32 v27, v50, v49 +; GFX950-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX950-NEXT: 
v_max_f32_e32 v26, v52, v51 +; GFX950-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX950-NEXT: v_max_f32_e32 v25, v54, v53 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v8, v25 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v9, v26 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v10, v27 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v11, v28 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v12, v29 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v13, v30 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v14, v33 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v31 +; GFX950-NEXT: v_max_f32_e32 v24, v32, v24 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_max_f32_e32 v23, v32, v23 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_max_f32_e32 v22, v32, v22 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_max_f32_e32 v21, v32, v21 +; GFX950-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_max_f32_e32 v20, v32, v20 +; GFX950-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_max_f32_e32 v19, v32, v19 +; GFX950-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v1 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX950-NEXT: v_max_f32_e32 v18, v32, v18 +; GFX950-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v15, v15, v31 +; GFX950-NEXT: v_max_f32_e32 v31, v40, v55 +; GFX950-NEXT: v_max_f32_e32 v17, v32, v17 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v17 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v18 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v2, v19 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, v20 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, v21 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v5, v22 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, v23 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v7, v31 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v15, v24 +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maxnum_v32bf16: ; GFX10: ; %bb.0: @@ -27543,36 +29235,66 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sqrt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0xf800000 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_sqrt_f32_e32 v1, v0 -; GFX9-NEXT: v_add_u32_e32 v2, -1, v1 -; GFX9-NEXT: v_fma_f32 v3, -v2, v1, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; GFX9-NEXT: v_fma_f32 v1, -v3, v1, v0 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] -; 
GFX9-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x260 -; GFX9-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sqrt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xf800000 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: v_sqrt_f32_e32 v1, v0 +; GFX900-NEXT: v_add_u32_e32 v2, -1, v1 +; GFX900-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3 +; GFX900-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] +; GFX900-NEXT: v_fma_f32 v1, -v3, v1, v0 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5] +; GFX900-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x260 +; GFX900-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sqrt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0xf800000 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX950-NEXT: v_sqrt_f32_e32 v1, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_add_u32_e32 v2, -1, v1 +; GFX950-NEXT: v_fma_f32 v3, -v2, v1, v0 +; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v3 +; GFX950-NEXT: v_add_u32_e32 v3, 1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] +; GFX950-NEXT: v_fma_f32 v1, -v3, v1, v0 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +; GFX950-NEXT: v_mul_f32_e32 v2, 0x37800000, v1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x260 +; GFX950-NEXT: v_cmp_class_f32_e32 vcc, v0, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sqrt_bf16: ; GFX10: ; %bb.0: @@ -27715,19 +29437,27 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_ldexp_bf16_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_ldexp_bf16_i32: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 
16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_ldexp_bf16_i32: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ldexp_bf16_i32: ; GFX10: ; %bb.0: @@ -27820,20 +29550,29 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_frexp_bf16_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_frexp_bf16_i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_frexp_bf16_i16: +; GFX950: 
; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_frexp_bf16_i16: ; GFX10: ; %bb.0: @@ -27979,35 +29718,61 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3377d1cf -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: 
v_log_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3f317217 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x3377d1cf +; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3f317217 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x41b17218 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log_bf16: ; GFX10: ; %bb.0: @@ -28153,26 +29918,42 @@ define 
bfloat @v_log2_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log2_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log2_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log2_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: 
v_mov_b32_e32 v1, 0x42000000 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_log2_bf16: ; GFX10: ; %bb.0: @@ -28329,35 +30110,61 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_log10_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x800000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_log_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX9-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x3284fbcf -; GFX9-NEXT: v_fma_f32 v2, v0, s4, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x7f800000 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_log10_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x800000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v1, 0, 
32, vcc +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_log_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3e9a209a +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX900-NEXT: v_fma_f32 v2, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x3284fbcf +; GFX900-NEXT: v_fma_f32 v2, v0, s4, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX900-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_log10_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x800000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3e9a209a +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_log_f32_e32 v0, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX950-NEXT: v_fma_f32 v2, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x7f800000 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX950-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, 0x411a209b +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: 
v_log10_bf16: ; GFX10: ; %bb.0: @@ -28541,36 +30348,61 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x3fb8aa3b -; GFX9-NEXT: v_rndne_f32_e32 v2, v1 -; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x32a5705f -; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xc2ce8ed0 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x42b17218 -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b +; GFX900-NEXT: v_rndne_f32_e32 v2, v1 +; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x32a5705f +; GFX900-NEXT: v_fma_f32 v1, v0, s4, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX900-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0xc2ce8ed0 +; GFX900-NEXT: 
v_cmp_ngt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42b17218 +; GFX900-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x3fb8aa3b +; GFX950-NEXT: v_rndne_f32_e32 v2, v1 +; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x32a5705f, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX950-NEXT: v_exp_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0xc2ce8ed0 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x42b17218 +; GFX950-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp_bf16: ; GFX10: ; %bb.0: @@ -28722,27 +30554,43 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp2_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_mov_b32 
s4, 0xc2fc0000 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0x42800000 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_exp_f32_e32 v0, v0 -; GFX9-NEXT: v_not_b32_e32 v1, 63 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp2_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_exp_f32_e32 v0, v0 +; GFX900-NEXT: v_not_b32_e32 v1, 63 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp2_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: s_mov_b32 s0, 0xc2fc0000 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: v_mov_b32_e32 v2, 0x42800000 +; GFX950-NEXT: v_not_b32_e32 v1, 63 +; GFX950-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_exp_f32_e32 v0, v0 +; 
GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp2_bf16: ; GFX10: ; %bb.0: @@ -28900,36 +30748,61 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_exp10_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x40549a78 -; GFX9-NEXT: v_rndne_f32_e32 v2, v1 -; GFX9-NEXT: v_sub_f32_e32 v3, v1, v2 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, -v1 -; GFX9-NEXT: s_mov_b32 s4, 0x33979a37 -; GFX9-NEXT: v_fma_f32 v1, v0, s4, v1 -; GFX9-NEXT: v_add_f32_e32 v1, v3, v1 -; GFX9-NEXT: v_exp_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0xc23369f4 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x421a209b -; GFX9-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_exp10_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40549a78 +; GFX900-NEXT: v_rndne_f32_e32 v2, v1 +; GFX900-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX900-NEXT: v_fma_f32 v1, v0, s4, -v1 +; GFX900-NEXT: s_mov_b32 s4, 0x33979a37 +; GFX900-NEXT: v_fma_f32 v1, 
v0, s4, v1 +; GFX900-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX900-NEXT: v_exp_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0xc23369f4 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x421a209b +; GFX900-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_exp10_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x40549a78 +; GFX950-NEXT: v_rndne_f32_e32 v2, v1 +; GFX950-NEXT: v_sub_f32_e32 v3, v1, v2 +; GFX950-NEXT: v_fma_f32 v1, v0, s0, -v1 +; GFX950-NEXT: v_fmamk_f32 v1, v0, 0x33979a37, v1 +; GFX950-NEXT: v_add_f32_e32 v1, v3, v1 +; GFX950-NEXT: v_exp_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0xc23369f4 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x421a209b +; GFX950-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX950-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, s0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_exp10_bf16: ; GFX10: ; %bb.0: @@ -29059,19 +30932,27 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_ceil_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_ceil_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_ceil_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_ceil_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_ceil_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_ceil_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_ceil_bf16: ; GFX10: ; %bb.0: @@ -29157,19 +31038,27 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_trunc_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 
-; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_trunc_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_trunc_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_trunc_bf16: ; GFX10: ; %bb.0: @@ -29255,19 +31144,27 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_rint_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_rint_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; 
GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_rint_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_rint_bf16: ; GFX10: ; %bb.0: @@ -29353,19 +31250,27 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_nearbyint_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_nearbyint_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_nearbyint_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_nearbyint_bf16: ; GFX10: ; %bb.0: @@ -29469,25 +31374,40 @@ define bfloat @v_round_bf16(bfloat %a) { 
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_round_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v0 -; GFX9-NEXT: v_sub_f32_e32 v2, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] -; GFX9-NEXT: s_brev_b32 s4, -2 -; GFX9-NEXT: v_bfi_b32 v0, s4, v2, v0 -; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_round_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v1, v0 +; GFX900-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX900-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GFX900-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; GFX900-NEXT: s_brev_b32 s4, -2 +; GFX900-NEXT: v_bfi_b32 v0, s4, v2, v0 +; GFX900-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_round_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v1, v0 +; GFX950-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] +; 
GFX950-NEXT: s_brev_b32 s0, -2 +; GFX950-NEXT: v_bfi_b32 v0, s0, v2, v0 +; GFX950-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_round_bf16: ; GFX10: ; %bb.0: @@ -29592,19 +31512,27 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_roundeven_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_rndne_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_roundeven_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_rndne_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_roundeven_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_rndne_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_roundeven_bf16: ; GFX10: ; %bb.0: @@ -29690,19 +31618,27 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_floor_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_floor_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_floor_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_floor_bf16: ; GFX10: ; %bb.0: @@ -29786,19 +31722,27 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_canonicalize_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_canonicalize_bf16: +; 
GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_canonicalize_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_canonicalize_bf16: ; GFX10: ; %bb.0: @@ -29929,14 +31873,24 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_oeq_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_oeq_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_oeq_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 
s[30:31] ; ; GFX10-LABEL: v_fcmp_oeq_bf16: ; GFX10: ; %bb.0: @@ -30004,14 +31958,24 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ogt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ogt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ogt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ogt_bf16: ; GFX10: ; %bb.0: @@ -30079,14 +32043,24 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_oge_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_oge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; 
GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_oge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_oge_bf16: ; GFX10: ; %bb.0: @@ -30154,14 +32128,24 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_olt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_olt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_olt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_olt_bf16: ; GFX10: ; %bb.0: @@ -30229,14 +32213,24 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ole_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ole_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ole_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_le_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ole_bf16: ; GFX10: ; %bb.0: @@ -30304,14 +32298,24 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_one_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_one_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_one_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; 
GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_one_bf16: ; GFX10: ; %bb.0: @@ -30379,14 +32383,24 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uno_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uno_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uno_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_u_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uno_bf16: ; GFX10: ; %bb.0: @@ -30454,14 +32468,24 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ueq_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ueq_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ueq_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ueq_bf16: ; GFX10: ; %bb.0: @@ -30529,14 +32553,24 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ugt_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ugt_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ugt_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ugt_bf16: ; GFX10: ; %bb.0: @@ -30604,14 +32638,24 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_uge_bf16: -; GFX9: ; %bb.0: -; 
GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_uge_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_uge_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_uge_bf16: ; GFX10: ; %bb.0: @@ -30679,14 +32723,24 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ult_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ult_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ult_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 +; GFX950-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ult_bf16: ; GFX10: ; %bb.0: @@ -30754,14 +32808,24 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_ule_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_ule_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_ule_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_ule_bf16: ; GFX10: ; %bb.0: @@ -30829,14 +32893,24 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fcmp_une_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fcmp_une_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX900-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fcmp_une_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fcmp_une_bf16: ; GFX10: ; %bb.0: @@ -31011,16 +33085,27 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; 
GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i16: ; GFX10: ; %bb.0: @@ -31110,18 +33195,31 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i16: ; GFX10: ; %bb.0: @@ -31232,21 +33330,37 @@ define <4 x i16> 
@v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v4bf16_to_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX900-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX900-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX950-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX950-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v3, s0 +; GFX950-NEXT: 
v_perm_b32 v1, v1, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i16: ; GFX10: ; %bb.0: @@ -31663,24 +33777,44 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_bf16_to_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v1, |v0|, s4 -; GFX9-NEXT: v_floor_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0xcf800000 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, s4, |v0| -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v1, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_bf16_to_i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v0, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v1, |v0|, s4 +; GFX900-NEXT: v_floor_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0xcf800000 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX900-NEXT: v_fma_f32 v1, v1, s4, |v0| +; GFX900-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX900-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_bf16_to_i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 
+; GFX950-NEXT: v_mul_f32_e64 v1, |v0|, s0 +; GFX950-NEXT: v_floor_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0xcf800000 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v1 +; GFX950-NEXT: v_fma_f32 v1, v1, s0, |v0| +; GFX950-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX950-NEXT: v_xor_b32_e32 v0, v1, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_bf16_to_i64: ; GFX10: ; %bb.0: @@ -31845,36 +33979,69 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v2bf16_to_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v2, |v1|, s4 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_floor_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_trunc_f32_e32 v4, v0 -; GFX9-NEXT: v_fma_f32 v3, v2, s5, |v1| -; GFX9-NEXT: v_mul_f32_e64 v0, |v4|, s4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_fma_f32 v5, v0, s5, |v4| -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, v2, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, v5, v3 -; GFX9-NEXT: v_xor_b32_e32 v4, v6, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; 
GFX900-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v2, |v1|, s4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_floor_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_trunc_f32_e32 v4, v0 +; GFX900-NEXT: v_fma_f32 v3, v2, s5, |v1| +; GFX900-NEXT: v_mul_f32_e64 v0, |v4|, s4 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX900-NEXT: v_fma_f32 v5, v0, s5, |v4| +; GFX900-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX900-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX900-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX900-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX900-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v2, |v1|, s0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_floor_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_trunc_f32_e32 v4, v0 +; GFX950-NEXT: v_fma_f32 v3, v2, s1, |v1| +; GFX950-NEXT: v_mul_f32_e64 v0, |v4|, s0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX950-NEXT: v_fma_f32 v5, v0, s1, |v4| +; 
GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v1 +; GFX950-NEXT: v_xor_b32_e32 v2, v2, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX950-NEXT: v_xor_b32_e32 v2, v5, v3 +; GFX950-NEXT: v_xor_b32_e32 v4, v6, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v2bf16_to_v2i64: ; GFX10: ; %bb.0: @@ -32082,49 +34249,96 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v6 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v3bf16_to_v3i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_mul_f32_e64 v5, |v1|, s4 -; GFX9-NEXT: 
v_floor_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v5, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, v8, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, v5, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_mul_f32_e64 v5, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v5, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 
v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX900-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v6 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v8, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v6, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_mul_f32_e64 v5, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v5, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v4, v8, v3 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: 
v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v1 +; GFX950-NEXT: v_xor_b32_e32 v5, v5, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v3bf16_to_v3i64: ; GFX10: ; %bb.0: @@ -32393,61 +34607,120 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fptosi_v4bf16_to_v4i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_mov_b32 s4, 0x2f800000 -; GFX9-NEXT: v_mul_f32_e64 v3, |v2|, s4 -; GFX9-NEXT: v_floor_f32_e32 v3, v3 -; GFX9-NEXT: s_mov_b32 s5, 0xcf800000 -; GFX9-NEXT: v_fma_f32 v4, v3, s5, |v2| -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v5, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_mul_f32_e64 v0, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v0, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_fma_f32 v6, v0, s5, |v5| -; GFX9-NEXT: v_xor_b32_e32 v4, v4, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v3, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc -; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_xor_b32_e32 v2, v6, v3 -; GFX9-NEXT: v_mul_f32_e64 v6, |v5|, s4 -; GFX9-NEXT: v_floor_f32_e32 v6, v6 -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v3 -; GFX9-NEXT: v_fma_f32 v7, v6, s5, |v5| -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; 
GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v4, v7, v5 -; GFX9-NEXT: v_mul_f32_e64 v7, |v1|, s4 -; GFX9-NEXT: v_floor_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GFX9-NEXT: v_fma_f32 v9, v7, s5, |v1| -; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GFX9-NEXT: v_xor_b32_e32 v6, v6, v5 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, v9, v1 -; GFX9-NEXT: v_xor_b32_e32 v7, v7, v1 -; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, v8 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: v_trunc_f32_e32 v2, v2 +; GFX900-NEXT: s_mov_b32 s4, 0x2f800000 +; GFX900-NEXT: v_mul_f32_e64 v3, |v2|, s4 +; GFX900-NEXT: v_floor_f32_e32 v3, v3 +; GFX900-NEXT: s_mov_b32 s5, 0xcf800000 +; GFX900-NEXT: v_fma_f32 v4, v3, s5, |v2| +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX900-NEXT: v_trunc_f32_e32 v5, v0 +; GFX900-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX900-NEXT: v_mul_f32_e64 v0, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v0, v0 +; GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_fma_f32 v6, v0, s5, |v5| +; GFX900-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX900-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX900-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX900-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; 
GFX900-NEXT: v_trunc_f32_e32 v5, v5 +; GFX900-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX900-NEXT: v_mul_f32_e64 v6, |v5|, s4 +; GFX900-NEXT: v_floor_f32_e32 v6, v6 +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX900-NEXT: v_fma_f32 v7, v6, s5, |v5| +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_trunc_f32_e32 v1, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX900-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX900-NEXT: v_mul_f32_e64 v7, |v1|, s4 +; GFX900-NEXT: v_floor_f32_e32 v7, v7 +; GFX900-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX900-NEXT: v_fma_f32 v9, v7, s5, |v1| +; GFX900-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX900-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX900-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX900-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX900-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX900-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX900-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 +; GFX900-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, v8 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX950-NEXT: v_trunc_f32_e32 v2, v2 +; GFX950-NEXT: s_mov_b32 s0, 0x2f800000 +; GFX950-NEXT: v_mul_f32_e64 v3, |v2|, s0 +; GFX950-NEXT: v_floor_f32_e32 v3, v3 +; GFX950-NEXT: s_mov_b32 s1, 0xcf800000 +; GFX950-NEXT: v_fma_f32 v4, v3, s1, |v2| +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX950-NEXT: v_trunc_f32_e32 v5, v0 +; GFX950-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX950-NEXT: v_mul_f32_e64 v0, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v0, v0 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_fma_f32 v6, 
v0, s1, |v5| +; GFX950-NEXT: v_xor_b32_e32 v4, v4, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v3, v3, v2 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v0 +; GFX950-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v8, vcc, v3, v2, vcc +; GFX950-NEXT: v_ashrrev_i32_e32 v3, 31, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX950-NEXT: v_trunc_f32_e32 v5, v5 +; GFX950-NEXT: v_xor_b32_e32 v2, v6, v3 +; GFX950-NEXT: v_mul_f32_e64 v6, |v5|, s0 +; GFX950-NEXT: v_floor_f32_e32 v6, v6 +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v3 +; GFX950-NEXT: v_fma_f32 v7, v6, s1, |v5| +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_trunc_f32_e32 v1, v1 +; GFX950-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX950-NEXT: v_xor_b32_e32 v4, v7, v5 +; GFX950-NEXT: v_mul_f32_e64 v7, |v1|, s0 +; GFX950-NEXT: v_floor_f32_e32 v7, v7 +; GFX950-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX950-NEXT: v_fma_f32 v9, v7, s1, |v1| +; GFX950-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX950-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX950-NEXT: v_xor_b32_e32 v6, v6, v5 +; GFX950-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc +; GFX950-NEXT: v_xor_b32_e32 v6, v9, v1 +; GFX950-NEXT: v_xor_b32_e32 v7, v7, v1 +; GFX950-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, v8 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fptosi_v4bf16_to_v4i64: ; GFX10: ; %bb.0: @@ -32594,18 +34867,25 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -32698,25 +34978,33 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, 
s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -32846,32 +35134,42 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
v_sitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: 
v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -33042,38 +35340,49 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: 
v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -33219,18 +35528,25 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; 
GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -33315,25 +35631,33 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; 
GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -33452,32 +35776,42 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -33629,38 +35963,49 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, 
v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; 
GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -33827,29 +36172,47 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v2, v0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v2 -; GFX9-NEXT: v_ffbh_i32_e32 v3, v1 -; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v3 -; GFX9-NEXT: v_min_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v2, v0, v1 +; 
GFX900-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX900-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX900-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX900-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_add_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_min_u32_e32 v2, v3, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -34044,47 +36407,77 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v5, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: 
v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v4, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v6, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, 
v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v6, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v5, v2, v3 +; GFX950-NEXT: v_ffbh_i32_e32 v4, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v4, v4, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: 
v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -34386,65 +36779,109 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v7, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_i32_e32 v6, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7 -; GFX9-NEXT: v_add_u32_e32 v6, -1, v6 -; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 
-; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v7, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_i32_e32 v6, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX900-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX900-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v5, 
v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v7, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v7, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, 
v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_i32_e32 v5, v3 +; GFX950-NEXT: v_add_u32_e32 v5, -1, v5 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v5, v5, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v6, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v6, 31, v6 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v6 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -34842,82 +37279,137 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_sitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v9, v4, v5 -; GFX9-NEXT: v_ffbh_i32_e32 v8, v5 -; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9 -; GFX9-NEXT: v_add_u32_e32 v8, -1, v8 -; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 -; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 
16, 1 -; GFX9-NEXT: v_xor_b32_e32 v5, v6, v7 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v4, v7 -; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5 -; GFX9-NEXT: v_add_u32_e32 v4, -1, v4 -; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 -; GFX9-NEXT: v_min_u32_e32 v10, v4, v5 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1 -; GFX9-NEXT: v_ffbh_i32_e32 v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8 -; GFX9-NEXT: v_add_u32_e32 v7, -1, v7 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_i32_e32 v0, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 -; GFX9-NEXT: v_add_u32_e32 v1, 32, v1 -; GFX9-NEXT: v_min_u32_e32 v8, v0, v1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; 
GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX900-NEXT: v_ffbh_i32_e32 v8, v5 +; GFX900-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX900-NEXT: v_add_u32_e32 v8, -1, v8 +; GFX900-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX900-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v5, v6, v7 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v4, v7 +; GFX900-NEXT: v_ashrrev_i32_e32 v5, 31, v5 +; GFX900-NEXT: v_add_u32_e32 v4, -1, v4 +; GFX900-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX900-NEXT: v_min_u32_e32 v10, v4, v5 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_xor_b32_e32 v8, v0, v1 +; GFX900-NEXT: v_ffbh_i32_e32 v7, v1 +; GFX900-NEXT: v_ashrrev_i32_e32 v8, 31, v8 +; GFX900-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX900-NEXT: v_add_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, v7, v8 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: 
v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_xor_b32_e32 v1, v2, v3 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_i32_e32 v0, v3 +; GFX900-NEXT: v_ashrrev_i32_e32 v1, 31, v1 +; GFX900-NEXT: v_add_u32_e32 v0, -1, v0 +; GFX900-NEXT: v_add_u32_e32 v1, 32, v1 +; GFX900-NEXT: v_min_u32_e32 v8, v0, v1 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_xor_b32_e32 v9, v6, v7 +; GFX950-NEXT: v_ffbh_i32_e32 v8, v7 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: v_add_u32_e32 v8, -1, v8 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v8, v8, v9 +; GFX950-NEXT: v_lshlrev_b64 
v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v7, v5 +; GFX950-NEXT: v_ashrrev_i32_e32 v9, 31, v9 +; GFX950-NEXT: v_add_u32_e32 v7, -1, v7 +; GFX950-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX950-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 +; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_xor_b32_e32 v7, v2, v3 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_i32_e32 v6, v3 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v6, -1, v6 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_xor_b32_e32 v7, v0, v1 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_i32_e32 v3, v1 +; GFX950-NEXT: v_ashrrev_i32_e32 v7, 31, v7 +; GFX950-NEXT: v_add_u32_e32 v3, -1, v3 +; GFX950-NEXT: v_add_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_min_u32_e32 v3, v3, v7 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -35202,18 +37694,25 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i16_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i16_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i16_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i16_to_bf16: ; GFX10: ; %bb.0: @@ -35306,25 +37805,33 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i16_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; 
GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i16_to_v2bf16: ; GFX10: ; %bb.0: @@ -35457,32 +37964,42 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX8-NEXT: 
v_alignbit_b32 v0, v0, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i16_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; 
GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v2, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i16_to_v3bf16: ; GFX10: ; %bb.0: @@ -35656,38 +38173,49 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i16_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, 
v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX900-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; 
GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i16_to_v4bf16: ; GFX10: ; %bb.0: @@ -35838,18 +38366,25 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i32_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i32_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i32_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i32_to_bf16: ; GFX10: ; %bb.0: @@ -35934,25 +38469,33 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i32_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX900-NEXT: 
v_add3_u32 v2, v2, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i32_to_v2bf16: ; GFX10: ; %bb.0: @@ -36071,32 +38614,42 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX8-NEXT: v_mov_b32_e32 v1, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i32_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v2, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v2, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i32_to_v3bf16: ; GFX10: ; %bb.0: @@ -36248,38 +38801,49 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i32_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, 
v2, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc -; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX900-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_bfe_u32 v4, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc +; GFX900-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v4, v4, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 
+; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX950-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i32_to_v4bf16: ; GFX10: ; %bb.0: @@ -36434,25 +38998,39 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_i64_to_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v2, v1 -; GFX9-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_i64_to_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX900-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; 
GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_i64_to_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v2, v1 +; GFX950-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v2 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_i64_to_bf16: ; GFX10: ; %bb.0: @@ -36606,39 +39184,61 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v0, v1, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v2i64_to_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v4, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v4 -; GFX9-NEXT: v_ldexp_f32 v4, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; 
GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v4, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX900-NEXT: v_ldexp_f32 v4, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v0, v4, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v6 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-NEXT: v_ffbh_u32_e32 v4, v3 +; GFX950-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v4 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v2i64_to_v2bf16: ; GFX10: ; %bb.0: @@ -36874,53 +39474,85 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v3i64_to_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v6, v5 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v1 -; GFX9-NEXT: v_min_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v5 -; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc -; GFX9-NEXT: v_ldexp_f32 v5, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1 -; 
GFX9-NEXT: v_add3_u32 v6, v0, v5, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v4, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX900-NEXT: v_ffbh_u32_e32 v6, v1 +; GFX900-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX900-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc +; GFX900-NEXT: v_ldexp_f32 v5, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v0, v5, s4 +; GFX900-NEXT: 
v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v5 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v7 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v4, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v6, v5 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v5 +; GFX950-NEXT: v_ffbh_u32_e32 v5, v3 +; GFX950-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v5 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; 
GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_mov_b32_e32 v1, v4 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v3i64_to_v3bf16: ; GFX10: ; %bb.0: @@ -37236,66 +39868,105 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX8-NEXT: v_alignbit_b32 v1, v4, v5, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_uitofp_v4i64_to_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_ffbh_u32_e32 v8, v5 -; GFX9-NEXT: v_min_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v8, v4, v5 -; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1 -; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v4, v7 -; GFX9-NEXT: v_min_u32_e32 v10, 32, v4 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v7, v1 -; GFX9-NEXT: v_min_u32_e32 v4, 1, v4 -; GFX9-NEXT: v_min_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc -; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_ldexp_f32 v4, v4, v6 -; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1 -; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc -; GFX9-NEXT: v_ldexp_f32 v6, v0, v1 -; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1 -; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4 -; GFX9-NEXT: v_ffbh_u32_e32 v0, v3 -; GFX9-NEXT: 
v_min_u32_e32 v8, 32, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6 -; GFX9-NEXT: v_min_u32_e32 v0, 1, v0 -; GFX9-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8 -; GFX9-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-NEXT: v_perm_b32 v1, v4, v5, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_ffbh_u32_e32 v8, v5 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5] +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v8, v4, v5 +; GFX900-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX900-NEXT: v_add3_u32 v9, v4, v8, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v4, v7 +; GFX900-NEXT: v_min_u32_e32 v10, 32, v4 +; GFX900-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7] +; GFX900-NEXT: v_ffbh_u32_e32 v7, v1 +; GFX900-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX900-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX900-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX900-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v8 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc +; GFX900-NEXT: v_sub_u32_e32 v6, 32, v10 +; GFX900-NEXT: 
v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX900-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v6, v6, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_sub_u32_e32 v1, 32, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc +; GFX900-NEXT: v_ldexp_f32 v6, v0, v1 +; GFX900-NEXT: v_bfe_u32 v0, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v7, v0, v6, s4 +; GFX900-NEXT: v_ffbh_u32_e32 v0, v3 +; GFX900-NEXT: v_min_u32_e32 v8, 32, v0 +; GFX900-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3] +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX900-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX900-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX900-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc +; GFX900-NEXT: v_sub_u32_e32 v2, 32, v8 +; GFX900-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX900-NEXT: v_perm_b32 v1, v4, v5, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_ffbh_u32_e32 v8, v7 +; GFX950-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX950-NEXT: v_lshlrev_b64 v[6:7], v8, v[6:7] +; GFX950-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX950-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v7, v5 +; GFX950-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX950-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] +; GFX950-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v6, v6 +; GFX950-NEXT: v_or_b32_e32 v4, v5, v4 +; GFX950-NEXT: v_cvt_f32_u32_e32 v4, v4 +; GFX950-NEXT: v_sub_u32_e32 v5, 32, v8 +; GFX950-NEXT: v_ldexp_f32 v5, v6, v5 
+; GFX950-NEXT: v_sub_u32_e32 v6, 32, v7 +; GFX950-NEXT: v_ldexp_f32 v4, v4, v6 +; GFX950-NEXT: v_ffbh_u32_e32 v6, v3 +; GFX950-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX950-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3] +; GFX950-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX950-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX950-NEXT: v_ffbh_u32_e32 v3, v1 +; GFX950-NEXT: v_min_u32_e32 v3, 32, v3 +; GFX950-NEXT: v_lshlrev_b64 v[0:1], v3, v[0:1] +; GFX950-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX950-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX950-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX950-NEXT: v_sub_u32_e32 v1, 32, v6 +; GFX950-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX950-NEXT: v_sub_u32_e32 v2, 32, v3 +; GFX950-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v4, v5 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_uitofp_v4i64_to_v4bf16: ; GFX10: ; %bb.0: @@ -37531,13 +40202,22 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_bf16: ; GFX10: ; %bb.0: 
@@ -37600,14 +40280,24 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_lhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_lhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_lhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_lhs_bf16: ; GFX10: ; %bb.0: @@ -37674,14 +40364,24 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_fneg_rhs_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_fneg_rhs_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_fneg_rhs_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_fneg_rhs_bf16: ; GFX10: ; %bb.0: @@ -37765,16 +40465,28 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, 
v1, vcc +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v2bf16: ; GFX10: ; %bb.0: @@ -37859,18 +40571,32 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX900-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; GFX900-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_sdwa v1, v3, v2, vcc 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[0:1] +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -37946,15 +40672,27 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_bf16: ; GFX10: ; %bb.0: @@ -38046,21 +40784,39 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: 
v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v1, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s1 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v0, v1, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v2bf16: ; GFX10: ; %bb.0: @@ -38159,22 +40915,42 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 
-; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s2, s0, 16 +; GFX900-NEXT: s_lshr_b32 s3, s1, 16 +; GFX900-NEXT: v_mov_b32_e32 v2, s3 +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX900-NEXT: s_mov_b32 s0, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s2, s0, 16 +; GFX950-NEXT: s_lshr_b32 s3, s1, 16 +; GFX950-NEXT: v_mov_b32_e32 v2, s3 +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v2bf16: ; GFX10: ; %bb.0: @@ -38285,14 +41061,24 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; 
GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v3bf16: ; GFX10: ; %bb.0: @@ -38383,14 +41169,24 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: 
v_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v4bf16: ; GFX10: ; %bb.0: @@ -38504,15 +41300,26 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v6bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v6bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v6bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v6bf16: ; GFX10: ; %bb.0: @@ -38651,16 +41458,28 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: 
v_select_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v8bf16: ; GFX10: ; %bb.0: @@ -38900,20 +41719,36 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v14, 
v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v10, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v13, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v14, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v16, v8, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v16bf16: ; GFX10: ; %bb.0: @@ -39469,32 +42304,60 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX8-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_select_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc -; GFX9-NEXT: 
buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v15, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_select_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: 
v_cndmask_b32_e32 v14, v17, v15, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e32 v15, v18, v16, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_select_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:4 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v1, v18, v2, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v2, v19, v3, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v3, v20, v4, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v4, v21, v5, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v5, v22, v6, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v6, v23, v7, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v7, v24, v8, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v8, v25, v9, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v9, v26, v10, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v10, v27, v11, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v11, v28, v12, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v12, v29, v13, vcc +; GFX950-NEXT: v_cndmask_b32_e32 v13, v30, v14, vcc +; GFX950-NEXT: s_waitcnt vmcnt(1) +; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v15, vcc +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_select_v32bf16: ; GFX10: ; %bb.0: @@ -39604,19 +42467,34 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v1 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: v_readfirstlane_b32 s1, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v3bf16: ; GFX10: ; %bb.0: @@ -39720,18 +42598,32 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_select_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_readfirstlane_b32 s0, v1 -; GFX9-NEXT: v_readfirstlane_b32 s1, v0 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_select_v4bf16: +; GFX900: ; %bb.0: 
+; GFX900-NEXT: v_mov_b32_e32 v1, s3 +; GFX900-NEXT: v_mov_b32_e32 v2, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_mov_b32_e32 v1, s2 +; GFX900-NEXT: v_mov_b32_e32 v2, s0 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX900-NEXT: v_readfirstlane_b32 s0, v1 +; GFX900-NEXT: v_readfirstlane_b32 s1, v0 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_select_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: v_mov_b32_e32 v1, s3 +; GFX950-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX950-NEXT: v_mov_b32_e32 v1, s2 +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX950-NEXT: v_readfirstlane_b32 s1, v0 +; GFX950-NEXT: v_readfirstlane_b32 s0, v1 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_select_v4bf16: ; GFX10: ; %bb.0: @@ -39854,34 +42746,66 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX8-NEXT: v_readfirstlane_b32 s1, v2 ; GFX8-NEXT: ; return to shader part epilog ; -; GFX9-LABEL: s_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: v_mov_b32_e32 v4, s5 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc -; GFX9-NEXT: s_mov_b32 s1, 0x5040100 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: v_perm_b32 v2, v3, v2, s1 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v4, s0 
-; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s1 -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v2 -; GFX9-NEXT: ; return to shader part epilog +; GFX900-LABEL: s_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_lshr_b32 s4, s1, 16 +; GFX900-NEXT: s_lshr_b32 s5, s3, 16 +; GFX900-NEXT: v_mov_b32_e32 v4, s5 +; GFX900-NEXT: v_mov_b32_e32 v5, s4 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_mov_b32_e32 v5, s1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; GFX900-NEXT: s_mov_b32 s1, 0x5040100 +; GFX900-NEXT: s_lshr_b32 s3, s0, 16 +; GFX900-NEXT: s_lshr_b32 s4, s2, 16 +; GFX900-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX900-NEXT: v_mov_b32_e32 v3, s4 +; GFX900-NEXT: v_mov_b32_e32 v4, s3 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX900-NEXT: v_mov_b32_e32 v3, s2 +; GFX900-NEXT: v_mov_b32_e32 v4, s0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX900-NEXT: v_readfirstlane_b32 s0, v0 +; GFX900-NEXT: v_readfirstlane_b32 s1, v2 +; GFX900-NEXT: ; return to shader part epilog +; +; GFX950-LABEL: s_vselect_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_lshr_b32 s4, s1, 16 +; GFX950-NEXT: s_lshr_b32 s5, s3, 16 +; GFX950-NEXT: v_mov_b32_e32 v4, s5 +; GFX950-NEXT: v_mov_b32_e32 v5, s4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX950-NEXT: s_lshr_b32 s4, s2, 16 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_mov_b32_e32 v5, s1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX950-NEXT: s_mov_b32 s1, 0x5040100 +; GFX950-NEXT: s_lshr_b32 s3, s0, 16 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v4, v5, 
vcc +; GFX950-NEXT: v_perm_b32 v2, v3, v2, s1 +; GFX950-NEXT: v_mov_b32_e32 v3, s4 +; GFX950-NEXT: v_mov_b32_e32 v4, s3 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX950-NEXT: v_mov_b32_e32 v3, s2 +; GFX950-NEXT: v_mov_b32_e32 v4, s0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s1 +; GFX950-NEXT: v_readfirstlane_b32 s1, v2 +; GFX950-NEXT: v_readfirstlane_b32 s0, v0 +; GFX950-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40053,26 +42977,48 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc -; GFX9-NEXT: s_mov_b64 vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v3, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v1, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 
v1, 1, v1 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX900-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX900-NEXT: s_mov_b64 vcc, s[4:5] +; GFX900-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[6:7] +; GFX900-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v3, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v1, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v2 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1 +; GFX950-NEXT: v_cndmask_b32_sdwa v2, v7, v5, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[2:3] +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc +; GFX950-NEXT: s_mov_b64 vcc, s[0:1] +; GFX950-NEXT: v_cndmask_b32_sdwa v3, v6, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_perm_b32 v0, v3, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v2, v1, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v4bf16: ; GFX10: ; %bb.0: @@ -40294,47 +43240,93 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> 
%cond, <8 x bfloat> %a, <8 x bflo ; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v8bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v14 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v13 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v12 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v7, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v8bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX900-NEXT: v_perm_b32 v3, v7, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v8bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 
+; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v15, v11, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_lshrrev_b32_e32 v11, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v14, v10, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: v_lshrrev_b32_e32 v10, 16, v13 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v10, v9, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_lshrrev_b32_e32 v9, 16, v12 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v12, v8, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v8bf16: ; GFX10: ; %bb.0: @@ -40803,85 +43795,171 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX8-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v16bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v6, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6 -; GFX9-NEXT: v_and_b32_e32 v8, 1, v13 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v30 -; GFX9-NEXT: v_and_b32_e32 v11, 1, v11 -; GFX9-NEXT: v_and_b32_e32 v9, 1, v9 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v7 -; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 -; GFX9-NEXT: v_and_b32_e32 v13, 1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v21 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v28 -; GFX9-NEXT: v_and_b32_e32 v5, 1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 -; GFX9-NEXT: v_and_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 1, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v23 -; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v6, v10, v6, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc -; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 -; GFX9-NEXT: v_perm_b32 v1, v3, v2, s4 -; GFX9-NEXT: v_perm_b32 v2, v5, v4, s4 -; GFX9-NEXT: v_perm_b32 v3, v9, v19, s4 -; GFX9-NEXT: v_perm_b32 v4, v11, v20, s4 -; GFX9-NEXT: v_perm_b32 v5, v12, v14, s4 -; GFX9-NEXT: v_perm_b32 v7, v7, v13, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v8 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v10 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v6 +; GFX900-NEXT: v_and_b32_e32 v6, 1, v12 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v6 +; GFX900-NEXT: v_and_b32_e32 v8, 1, v13 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v30, v22, s[8:9] +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v8 +; GFX900-NEXT: 
buffer_load_dword v8, off, s[0:3], s32 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v22 +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v30 +; GFX900-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX900-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX900-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX900-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX900-NEXT: v_and_b32_e32 v13, 1, v14 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v12, v10, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v21 +; GFX900-NEXT: v_cndmask_b32_e64 v14, v29, v21, s[6:7] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v29 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v28, v20, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v19 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v21, v12, s[6:7] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v28 +; GFX900-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v9, v22, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX900-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v18 +; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX900-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v27, v21, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v23 +; GFX900-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX900-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v6, v10, v6, s4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e32 v13, v8, v23, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v25 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v15, v8, vcc +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v16 +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v24 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v15, v8, vcc +; GFX900-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX900-NEXT: v_perm_b32 v1, v3, v2, s4 +; GFX900-NEXT: v_perm_b32 v2, v5, v4, s4 +; GFX900-NEXT: v_perm_b32 v3, v9, v19, s4 +; GFX900-NEXT: v_perm_b32 v4, v11, v20, s4 +; GFX900-NEXT: v_perm_b32 v5, v12, v14, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v13, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v31, off, s32 +; GFX950-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX950-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v21 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX950-NEXT: v_cndmask_b32_e32 
v11, v36, v35, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v20 +; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: v_lshrrev_b32_e32 v39, 16, v19 +; GFX950-NEXT: v_lshrrev_b32_e32 v48, 16, v27 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX950-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX950-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v31 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v15, v18, v32, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: v_lshrrev_b32_e32 v18, 16, v25 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v18, v17, vcc +; 
GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: v_lshrrev_b32_e32 v17, 16, v24 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0 +; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0 +; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0 +; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v16bf16: ; GFX10: ; %bb.0: @@ -41981,205 +45059,438 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_vselect_v32bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 -; GFX9-NEXT: 
v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 -; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 -; GFX9-NEXT: v_writelane_b32 v33, s34, 2 -; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v33, s35, 3 -; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, 1, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 
offset:56 -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v26 -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v24 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] -; GFX9-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] -; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] -; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x5040100 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v2, v5, s4 -; GFX9-NEXT: v_perm_b32 v2, v4, v7, s4 -; GFX9-NEXT: v_perm_b32 v3, v6, v9, s4 -; GFX9-NEXT: v_perm_b32 v4, v8, v11, s4 -; GFX9-NEXT: v_perm_b32 v5, v10, v13, s4 -; GFX9-NEXT: v_perm_b32 v6, v12, v15, s4 -; GFX9-NEXT: v_perm_b32 v7, v14, v17, s4 -; GFX9-NEXT: v_perm_b32 v8, v16, v19, s4 -; GFX9-NEXT: v_perm_b32 v9, v18, v21, s4 -; GFX9-NEXT: v_perm_b32 v10, v20, v23, s4 -; GFX9-NEXT: 
v_perm_b32 v11, v22, v25, s4 -; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 -; GFX9-NEXT: v_perm_b32 v13, v26, v29, s4 -; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 -; GFX9-NEXT: v_perm_b32 v15, v31, v30, s4 -; GFX9-NEXT: v_readlane_b32 s35, v33, 3 -; GFX9-NEXT: v_readlane_b32 s34, v33, 2 -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_vselect_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v3 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v5 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v4 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v7 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v6 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v9 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v8 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v11 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v10 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v13 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v12 +; 
GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v15 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v14 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v17 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v16 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v19 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v18 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v21 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v20 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v23 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v22 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v25 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v24 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v27 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v26 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v29 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v28 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0 +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32 +; GFX900-NEXT: v_writelane_b32 v33, s30, 0 +; GFX900-NEXT: v_writelane_b32 v33, s31, 1 +; GFX900-NEXT: v_writelane_b32 v33, s34, 2 +; GFX900-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX900-NEXT: v_writelane_b32 v33, s35, 3 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 +; GFX900-NEXT: v_and_b32_e32 v0, 1, v30 +; GFX900-NEXT: v_cmp_eq_u32_e64 
s[34:35], 1, v0 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 +; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 +; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 +; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 +; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 +; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 +; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16 +; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 +; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20 +; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 +; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24 +; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92 +; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 +; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96 +; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 +; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100 +; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 +; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104 +; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40 +; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108 +; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44 +; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112 +; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48 +; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116 +; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 +; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 +; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 +; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124 +; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60 +; 
GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35] +; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32 +; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31] +; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95] +; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93] +; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91] +; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89] +; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79] +; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25 +; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77] +; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75] +; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23 +; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73] +; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63] +; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61] +; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59] +; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57] +; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47] +; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45] +; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43] +; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; 
GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41] +; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29] +; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] +; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25] +; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] +; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21] +; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] +; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17] +; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] +; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13] +; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] +; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9] +; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] +; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x5040100 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4 +; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4 +; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4 +; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4 +; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4 +; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4 +; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4 +; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4 +; GFX900-NEXT: 
v_perm_b32 v11, v22, v25, s4 +; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4 +; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4 +; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4 +; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4 +; GFX900-NEXT: v_readlane_b32 s35, v33, 3 +; GFX900-NEXT: v_readlane_b32 s34, v33, 2 +; GFX900-NEXT: v_readlane_b32 s31, v33, 1 +; GFX900-NEXT: v_readlane_b32 s30, v33, 0 +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_vselect_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124 +; GFX950-NEXT: scratch_load_ushort v33, off, s32 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:128 +; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:120 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28 +; GFX950-NEXT: 
scratch_load_dword v54, off, s32 offset:108 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96 +; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100 +; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104 +; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40 +; GFX950-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29 +; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84 +; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20 +; GFX950-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX950-NEXT: v_and_b32_e32 v27, 1, v27 +; GFX950-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX950-NEXT: v_and_b32_e32 v25, 1, v25 +; GFX950-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX950-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX950-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX950-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX950-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX950-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX950-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX950-NEXT: v_and_b32_e32 v17, 1, v17 +; GFX950-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX950-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX950-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX950-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX950-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX950-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX950-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX950-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 1, v6 +; 
GFX950-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX950-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX950-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX950-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshrrev_b32_e32 v46, 16, v31 +; GFX950-NEXT: s_waitcnt vmcnt(23) +; GFX950-NEXT: v_lshrrev_b32_e32 v47, 16, v32 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_and_b32_e32 v28, 1, v33 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:80 +; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:16 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v28 +; GFX950-NEXT: v_and_b32_e32 v28, 1, v30 +; GFX950-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28 +; GFX950-NEXT: scratch_load_dword v28, off, s32 offset:76 +; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:12 +; GFX950-NEXT: s_waitcnt vmcnt(25) +; GFX950-NEXT: v_lshrrev_b32_e32 v58, 16, v34 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; GFX950-NEXT: v_lshrrev_b32_e32 v59, 16, v35 +; GFX950-NEXT: v_cndmask_b32_e64 v34, v35, v34, s[4:5] +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:72 +; GFX950-NEXT: v_cndmask_b32_e64 v58, v59, v58, s[2:3] +; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:8 +; GFX950-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[0:1] +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:68 +; GFX950-NEXT: v_cndmask_b32_e32 v46, v47, v46, vcc +; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:4 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26 +; GFX950-NEXT: s_mov_b32 s0, 0x5040100 +; GFX950-NEXT: s_waitcnt vmcnt(26) +; GFX950-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v37 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v36 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v27 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 +; GFX950-NEXT: s_waitcnt vmcnt(24) +; 
GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v39 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v38 +; GFX950-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v25, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22 +; GFX950-NEXT: s_waitcnt vmcnt(22) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v49 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v48 +; GFX950-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54 +; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44 +; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42 +; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40 +; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, 
v41, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52 +; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51 +; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50 +; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v11, v37, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v56 +; GFX950-NEXT: v_cndmask_b32_e32 v8, v29, v56, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v9, v29, v36, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v57 +; GFX950-NEXT: v_cndmask_b32_e32 v6, v33, v57, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v7, v33, v29, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX950-NEXT: s_waitcnt vmcnt(4) +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; GFX950-NEXT: v_cndmask_b32_e32 v4, v28, v30, vcc +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_cndmask_b32_e32 v5, v28, v29, 
vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX950-NEXT: s_waitcnt vmcnt(2) +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v59 +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v35 +; GFX950-NEXT: v_cndmask_b32_e32 v2, v35, v59, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v3, v29, v28, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v47 +; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v32 +; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v47, vcc +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: v_cndmask_b32_e32 v1, v29, v28, vcc +; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0 +; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0 +; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0 +; GFX950-NEXT: v_perm_b32 v3, v7, v6, s0 +; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0 +; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0 +; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0 +; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0 +; GFX950-NEXT: v_perm_b32 v8, v17, v16, s0 +; GFX950-NEXT: v_perm_b32 v9, v19, v18, s0 +; GFX950-NEXT: v_perm_b32 v10, v21, v20, s0 +; GFX950-NEXT: v_perm_b32 v11, v23, v22, s0 +; GFX950-NEXT: v_perm_b32 v12, v25, v24, s0 +; GFX950-NEXT: v_perm_b32 v13, v27, v26, s0 +; GFX950-NEXT: v_perm_b32 v14, v46, v31, s0 +; GFX950-NEXT: v_perm_b32 v15, v58, v34, s0 +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: @@ -42769,21 +46080,31 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_bf16: ; GFX10: ; %bb.0: @@ -42912,31 +46233,45 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v5, v4, v3 -; GFX9-NEXT: v_and_b32_e32 
v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v5, v4, v3 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v4, 
0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v5, v4 +; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v2bf16: ; GFX10: ; %bb.0: @@ -43118,41 +46453,60 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v6, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, 
s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v6, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, s0 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; 
GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v6, v5 +; GFX950-NEXT: v_fmac_f32_e32 v4, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v4, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v3bf16: ; GFX10: ; %bb.0: @@ -43394,50 +46748,73 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fma_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX9-NEXT: v_fma_f32 v6, v8, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX9-NEXT: v_fma_f32 v3, v7, v5, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, 
v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fma_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX900-NEXT: v_fma_f32 v6, v8, v7, v6 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX900-NEXT: v_fma_f32 v3, v7, v5, v3 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; 
GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v6, v8, v7 +; GFX950-NEXT: v_fmac_f32_e32 v5, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX950-NEXT: v_fmac_f32_e32 v1, v7, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v3, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v3, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v5, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fma_v4bf16: ; GFX10: ; %bb.0: @@ -43640,28 +47017,41 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, 
vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_bf16: ; GFX10: ; %bb.0: @@ -43839,45 +47229,65 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x 
bfloat> %b, <2 x bfl ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v2bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v2bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX900-NEXT: v_bfe_u32 v4, v3, 
16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_add3_u32 v4, v4, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v2bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v4, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v4 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v2bf16: ; GFX10: ; %bb.0: @@ -44145,62 +47555,90 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX8-NEXT: v_alignbit_b32 v0, v0, v3, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v3bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: 
v_or_b32_e32 v6, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_alignbit_b32 v1, s4, v1, 16 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v3bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; 
GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_alignbit_b32 v1, s4, v1, 16 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v3bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: 
v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v3bf16: ; GFX10: ; %bb.0: @@ -44560,78 +47998,113 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: v_fmuladd_v4bf16: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5 -; GFX9-NEXT: v_add_f32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4 -; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 -; GFX9-NEXT: v_add_f32_e32 v1, 
v1, v3 -; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX9-NEXT: v_add_f32_e32 v3, v3, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4 -; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 -; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc -; GFX9-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-NEXT: v_perm_b32 v0, v0, v3, s4 -; GFX9-NEXT: v_perm_b32 v1, v1, v6, s4 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: v_fmuladd_v4bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX900-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX900-NEXT: 
v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; GFX900-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v7, v6, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_add3_u32 v7, v7, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX900-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX900-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX900-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v5, v3, 16, 1 +; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2 +; 
GFX900-NEXT: v_add3_u32 v5, v5, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX900-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX900-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_v4bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v5 +; GFX950-NEXT: v_mul_f32_e32 v6, v7, v6 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX950-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_mul_f32_e32 v3, v5, v3 +; GFX950-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v6, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v4 +; GFX950-NEXT: v_add_f32_e32 v6, v6, v7 +; GFX950-NEXT: v_add_f32_e32 v3, v3, v5 +; GFX950-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX950-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fmuladd_v4bf16: ; GFX10: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 7eb7d72..006fe51 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead 
$scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir index 253e7e2..0e5ef3c 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-indirect-branch.mir @@ -68,7 +68,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; 
CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -149,7 +149,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir index 474ba71..a25c52f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-no-terminators.mir @@ -69,7 +69,7 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x04000000), %bb.7(0x7c000000) ; CHECK-NEXT: liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, 
$sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, $sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:VRegOrLds_32 */, def renamable $sgpr4 + ; CHECK-NEXT: INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 ; CHECK-NEXT: S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.3, implicit killed $scc ; CHECK-NEXT: {{ $}} @@ -151,7 +151,7 @@ body: | successors: %bb.3(0x04000000), %bb.2(0x7c000000) liveins: $vcc_hi, $vcc_lo, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $sgpr38, $sgpr39, $sgpr40, $sgpr41, $sgpr42, $sgpr43, $sgpr44, $sgpr45, $sgpr46, $sgpr47, $sgpr48, $sgpr49, $sgpr50, $sgpr51, $sgpr52, $sgpr53, $sgpr54, $sgpr55, $sgpr56, $sgpr57, 
$sgpr58, $sgpr59, $sgpr60, $sgpr61, $sgpr62, $sgpr63, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $vgpr0, $vgpr1 - INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2097162 /* regdef:SReg_32 */, def renamable $sgpr4 + INLINEASM &"v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64\0A v_nop_e64", 1 /* sideeffect attdialect */, 2359306 /* regdef:SReg_32 */, def renamable $sgpr4 S_CMP_LG_U32 killed renamable $sgpr4, 0, implicit-def $scc S_CBRANCH_SCC1 %bb.2, implicit killed $scc diff --git a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir index 4404f1a..ac8ef48 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/coalescer-early-clobber-subreg.mir @@ -20,10 +20,10 @@ body: | ; CHECK-LABEL: name: foo1 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def 
%0:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -41,10 +41,10 @@ body: | ; CHECK-LABEL: name: foo2 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %1:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %0 %2.sub1:vreg_64 = COPY killed %1 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -62,10 +62,10 @@ body: | ; CHECK-LABEL: name: foo3 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VS_32 */, def undef %2.sub0, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber 
%0:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) @@ -83,10 +83,10 @@ body: | ; CHECK-LABEL: name: foo4 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VS_32 */, def undef early-clobber %2.sub1, 1835018 /* regdef:VS_32 */, def undef %2.sub0 + ; CHECK-NEXT: INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def undef early-clobber %2.sub1, 2228234 /* regdef:VGPR_32 */, def undef %2.sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) ; CHECK-NEXT: S_ENDPGM 0 - INLINEASM &"", 0 /* attdialect */, 1835019 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 1835018 /* regdef:VGPR_32 */, def %1:vgpr_32 + INLINEASM &"", 0 /* attdialect */, 2228235 /* regdef-ec:VGPR_32 */, def early-clobber %0:vgpr_32, 2228234 /* regdef:VGPR_32 */, def %1:vgpr_32 undef %2.sub0:vreg_64 = COPY killed %1 %2.sub1:vreg_64 = COPY killed %0 FLAT_STORE_DWORDX2 killed $vgpr0_vgpr1, killed %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 4cb0d2d..e6c38d2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte 
Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 
killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1046,10 +1038,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2667,28 +2659,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 
v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -2699,7 +2684,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3238,10 +3222,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 
offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll new file mode 100644 index 0000000..01ebe7d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll @@ -0,0 +1,298 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s + +/* TODO: Support safe bf16 fdiv lowering. 
+define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) { + %fdiv = fdiv bfloat %x, %y + ret bfloat %fdiv +} +*/ + +define bfloat @v_rcp_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_abs(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l| +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0| +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %fabs + ret bfloat %fdiv +} + +define bfloat @v_rcp_bf16_afn(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv afn bfloat 1.0, %x + ret bfloat %fdiv +} + +define bfloat 
@v_rcp_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fdiv = fdiv bfloat -1.0, %x + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. 
+define bfloat @v_rsq_bf16_neg(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l +; GFX1250-TRUE16-NEXT: v_nop +; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1) +; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: 
v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0 + %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1 + ret <2 x bfloat> %r2 +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv contract bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. 
+define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat 1.0, %sqrt + ret bfloat %fdiv +} + +; TODO: Support lowering to v_rsq_bf16. +define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x) + %fdiv = fdiv bfloat -1.0, %sqrt + ret bfloat %fdiv +} + +define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: 
v_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: v_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt + ret <2 x bfloat> %fdiv +} + +define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) { +; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16: +; GFX1250-TRUE16: ; %bb.0: +; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h +; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l +; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h +; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l +; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-FAKE16-LABEL: 
v_neg_rsq_v2bf16: +; GFX1250-FAKE16: ; %bb.0: +; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1 +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0 +; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1 +; GFX1250-FAKE16-NEXT: v_nop +; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) + %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt + ret <2 x bfloat> %fdiv +} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir new file mode 100644 index 0000000..e5955ad --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi-gfx1250.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=si-fold-operands -stop-after=prologepilog -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: test_fold_fi_scratch_load_vgpr +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr + ; GCN: renamable $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = 
SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0, implicit %1 + +... + +# SS form of the SCRATCH_LOAD_DWORD does not support offset scaling + +--- +name: test_no_fold_fi_scratch_load_vgpr_scale_offset +tracksRegLiveness: true +machineFunctionInfo: + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + stackPtrOffsetReg: $sgpr32 +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 } +body: | + bb.0.entry: + ; GCN-LABEL: name: test_no_fold_fi_scratch_load_vgpr_scale_offset + ; GCN: renamable $vgpr0 = V_MOV_B32_e32 $sgpr32, implicit $exec + ; GCN-NEXT: renamable $vgpr0 = SCRATCH_LOAD_DWORD killed renamable $vgpr0, 4, 2048, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = SCRATCH_LOAD_DWORD %0:vgpr_32, 4, 2048, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5) + S_ENDPGM 0, implicit %1 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll new file mode 100644 index 0000000..b68786b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: basic_test + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s32), [[SELECT1]](s32), 1, 1, 1, 0 + ; CHECK-NEXT: $vgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. 
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: unused_active + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 14 + ; CHECK-NEXT: $vgpr0 = COPY [[C]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY1]] + ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s1), [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), [[ICMP]](s1) + ; CHECK-NEXT: G_BRCOND [[INT]](s1), %bb.2 + ; CHECK-NEXT: G_BR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.if.then: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3.if.end: + ; CHECK-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[COPY1]](s32), %bb.1, [[ADD]](s32), %bb.2 + ; CHECK-NEXT: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INT1]](s32) + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[COPY]], [[PHI]] + ; CHECK-NEXT: $vgpr0 = COPY [[SELECT]](s32) + ; CHECK-NEXT: 
G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; CHECK-LABEL: name: ret_64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK-NEXT: [[AMDGPU_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:_(s1) = G_AMDGPU_WHOLE_WAVE_FUNC_SETUP + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 5 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV]], [[C]] + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), [[MV1]], [[C1]] + ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.update.dpp), [[SELECT]](s64), [[SELECT1]](s64), 1, 1, 1, 0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[INTRINSIC_CONVERGENT]](s64) + ; CHECK-NEXT: $vgpr0 = COPY [[UV]](s32) + ; CHECK-NEXT: $vgpr1 = COPY [[UV1]](s32) + ; CHECK-NEXT: G_AMDGPU_WHOLE_WAVE_FUNC_RETURN [[AMDGPU_WHOLE_WAVE_FUNC_SETUP]](s1), implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, 
i1 false) + ret i64 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll new file mode 100644 index 0000000..3450d63 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/isel-whole-wave-functions.ll @@ -0,0 +1,191 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -stop-after=finalize-isel < %s | FileCheck --check-prefix=GISEL %s + +define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: basic_test + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: basic_test + ; GISEL: bb.1 (%ir-block.0): 
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY2]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY3]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_1]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. 
+define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: unused_active + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 14, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: unused_active + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 14 + ; GISEL-NEXT: $vgpr0 = COPY [[S_MOV_B32_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ret i32 14 +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { + ; DAGISEL-LABEL: name: multiple_blocks + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[COPY1]], [[COPY]], implicit $exec + ; DAGISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: S_BRANCH %bb.1 + ; DAGISEL-NEXT: {{ $}} + ; 
DAGISEL-NEXT: bb.1.if.then: + ; DAGISEL-NEXT: successors: %bb.2(0x80000000) + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[COPY]], 0, implicit $exec + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: bb.2.if.end: + ; DAGISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, [[V_ADD_U32_e64_]], %bb.1 + ; DAGISEL-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]] + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY1]], [[COPY3]], implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + ; + ; GISEL-LABEL: name: multiple_blocks + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GISEL-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: S_BRANCH %bb.2 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.2.if.then: + ; GISEL-NEXT: successors: %bb.3(0x80000000) + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: bb.3.if.end: + ; GISEL-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]], %bb.1, [[V_ADD_U32_e64_]], %bb.2 + ; GISEL-NEXT: 
SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[PHI]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_CNDMASK_B32_e64_]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0 + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { + ; DAGISEL-LABEL: name: ret_64 + ; DAGISEL: bb.0 (%ir-block.0): + ; DAGISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; DAGISEL-NEXT: {{ $}} + ; DAGISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; DAGISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; DAGISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; DAGISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; DAGISEL-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; DAGISEL-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; DAGISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; DAGISEL-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[SI_WHOLE_WAVE_FUNC_SETUP]] + ; DAGISEL-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; DAGISEL-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY5]], 
[[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 5 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_1]], 0, killed [[COPY6]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[S_MOV_B32_]], 0, killed [[COPY7]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; DAGISEL-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 3 + ; DAGISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[S_MOV_B32_2]], 0, killed [[COPY8]], [[COPY4]], implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], killed [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], killed [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; DAGISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; DAGISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; DAGISEL-NEXT: [[DEF4:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; DAGISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + ; + ; GISEL-LABEL: name: ret_64 + ; GISEL: bb.1 (%ir-block.0): + ; GISEL-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GISEL-NEXT: {{ $}} + ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GISEL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GISEL-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GISEL-NEXT: [[SI_WHOLE_WAVE_FUNC_SETUP:%[0-9]+]]:sreg_32_xm0_xexec = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + ; 
GISEL-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_]], 0, [[COPY]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_1]], 0, [[COPY1]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_2:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_2]], 0, [[COPY2]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_CNDMASK_B32_e64_3:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[V_MOV_B32_e32_3]], 0, [[COPY3]], [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_]], [[V_CNDMASK_B32_e64_2]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_CNDMASK_B32_e64_1]], [[V_CNDMASK_B32_e64_3]], 1, 1, 1, 0, implicit $exec + ; GISEL-NEXT: $vgpr0 = COPY [[V_MOV_B32_dpp]] + ; GISEL-NEXT: $vgpr1 = COPY [[V_MOV_B32_dpp1]] + ; GISEL-NEXT: SI_WHOLE_WAVE_FUNC_RETURN [[SI_WHOLE_WAVE_FUNC_SETUP]], implicit $vgpr0, implicit $vgpr1 + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll index b77b2f7..1ec4f25 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.alignbyte.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | 
FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s @@ -19,6 +21,30 @@ define amdgpu_kernel void @v_alignbyte_b32(ptr addrspace(1) %out, i32 %src1, i32 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_alignbyte_b32 v1, s0, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_alignbyte_b32 v0, s0, s1, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 @@ -73,6 +99,41 @@ define amdgpu_kernel void @v_alignbyte_b32_2(ptr addrspace(1) %out, ptr addrspac ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; +; GFX9-LABEL: v_alignbyte_b32_2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_alignbyte_b32 v1, v1, v2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: v_alignbyte_b32_2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_load_dword s2, s[4:5], 0x3c +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbyte_b32 v0, v1, v0, s2 +; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: s_endpgm +; ; GFX11-TRUE16-LABEL: v_alignbyte_b32_2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll index 25889de..9565314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll @@ -9,6 +9,172 @@ declare half @llvm.amdgcn.cvt.f16.fp8(i32, i32) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16) declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16) +define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; 
GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e32 v0.l, v0 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 0) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1 +; GFX1250-GISEL-REAL16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:1 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 1) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:2 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part 
epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 2) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ret = fpext half %cvt to float + ret float %ret +} + +define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) { +; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-REAL16: ; %bb.0: +; GFX1250-SDAG-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3 +; GFX1250-SDAG-REAL16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX1250-SDAG-REAL16-NEXT: ; return to shader part epilog +; +; 
GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-REAL16: ; %bb.0: +; GFX1250-GISEL-REAL16-NEXT: v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3 +; GFX1250-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-REAL16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-REAL16-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi: +; GFX1250-GISEL-FAKE16: ; %bb.0: +; GFX1250-GISEL-FAKE16-NEXT: v_cvt_f16_bf8_e64 v0, v0 byte_sel:3 +; GFX1250-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 +; GFX1250-GISEL-FAKE16-NEXT: ; return to shader part epilog + %cvt = tail call half @llvm.amdgcn.cvt.f16.bf8(i32 %a, i32 3) + %ins.0 = insertelement <2 x half> undef, half 0.0, i32 0 + %ins.1 = insertelement <2 x half> %ins.0, half %cvt, i32 1 + %ret = bitcast <2 x half> %ins.1 to float + ret float %ret +} + define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) { ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0: ; GFX1250-SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll index 2f5ff90..9149ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll @@ -304,6 +304,556 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], 
v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 2, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps 
void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 0, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 0, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 
+; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 1, <16 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], 
off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v12i32(i32 1, <16 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr 
addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v8i32(i32 1, <16 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 2, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: 
v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 2, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27]
matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 2, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v16i32(i32 3, <12 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: 
global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v12i32(i32 3, <12 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm
+; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v12i32.v8i32(i32 3, <12 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], 
v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off +; GISEL-NEXT: global_store_b128 v[32:33], v[28:31], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v16i32(i32 4, <8 x i32> %A, i32 1, <16 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 2, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GFX1250-NEXT: global_store_b128 v[28:29], v[20:23], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF6 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[28:29], v[20:23], off +; GISEL-NEXT: global_store_b128 v[28:29], v[24:27], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v12i32(i32 4, <8 x i32> %A, i32 3, <12 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT:
global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v8i32.v8i32(i32 4, <8 x i32> %A, i32 4, <8 x i32> %B, i16 0, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -815,6 +1365,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) @@ -824,6 +1375,7 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1) declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>) + 
declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1) declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll index fe8358f..12ea314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll @@ -1342,6 +1342,110 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; 
GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34 +; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34 +; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s1, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34 +; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34 +; GFX1250-NEXT: v_mov_b32_e32 v41, v34 +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], v[34:41] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x 
i32> %B, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: ; GFX1250: ; %bb.0: ; %bb @@ -2227,6 +2331,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll index 9802144a..bf8308b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll @@ -1126,6 +1126,72 @@ bb: ret void } +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC: +; GISEL: ; 
%bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 1, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 3, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], 
v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32 0, <16 x i32> %A, i32 0, <16 x i32> %B, i16 4, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: ; GFX1250: ; %bb.0: ; %bb @@ -1967,6 +2033,7 @@ declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.f8f6f4.v8f32.v16i32.v16i32(i32, <16 x i32>, i32, <16 x i32>, i16, <8 x float>) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll new file mode 100644 index 
0000000..ced96ee --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.cos.bf16(bfloat) #0 + +define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 4.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: cos_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_cos_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %cos = call bfloat @llvm.cos.bf16(bfloat 100.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 978f223..8c1e166 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -5213,121 +5213,15 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 { } define float @v_exp_f32_undef() { -; VI-SDAG-LABEL: v_exp_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 
-; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 
v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp_f32_undef: -; SI-GISEL: ; %bb.0: -; 
SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index 70c3787..edc505b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -5291,121 +5291,15 @@ define float @v_exp10_f32_dynamic_mode(float %in) #1 { } define float @v_exp10_f32_undef() { -; VI-SDAG-LABEL: v_exp10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_rndne_f32_e32 v0, 0 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7fc00000 -; VI-SDAG-NEXT: v_add_f32_e64 v1, -v0, s4 -; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 -; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-SDAG-NEXT: v_ldexp_f32 v0, v1, v0 -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_sub_f32_e64 v0, s4, 0 -; VI-GISEL-NEXT: 
v_mov_b32_e32 v1, 0x40549000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549000, v0 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v2, v0 -; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_exp10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; GFX900-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 
0x33979a37 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; GFX900-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; SI-SDAG-LABEL: v_exp10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0xffc00000 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v0, v1 -; SI-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 -; SI-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0 -; SI-SDAG-NEXT: v_rndne_f32_e32 v1, 0x7fc00000 -; SI-SDAG-NEXT: v_sub_f32_e32 v2, 0x7fc00000, v1 -; SI-SDAG-NEXT: v_add_f32_e32 v0, v2, v0 -; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0 -; SI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v1 -; SI-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp10_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; -; SI-GISEL-LABEL: v_exp10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v0 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37 -; SI-GISEL-NEXT: v_fma_f32 v0, s4, v2, v0 -; SI-GISEL-NEXT: v_rndne_f32_e32 v2, v1 -; SI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v2 -; SI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0 -; 
SI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2 -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; SI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_exp10_f32_undef: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp10_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 15bcab9..e71ea50 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -2783,56 +2783,10 @@ define float @v_exp2_f32_dynamic_mode(float %in) #1 { } define float @v_exp2_f32_undef() { -; GCN-SDAG-LABEL: v_exp2_f32_undef: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-SDAG-NEXT: v_exp_f32_e32 v0, 0x7fc00000 -; GCN-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_exp2_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; SI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_exp2_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; 
VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; VI-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_not_b32_e32 v1, 63 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_exp2_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_add_f32_e64 v2, s4, 0 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_not_b32_e32 v1, 63 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: v_exp2_f32_undef: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_exp2_f32_undef: ; R600: ; %bb.0: @@ -4076,3 +4030,4 @@ attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; GCN-GISEL: {{.*}} +; GCN-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 5634df5..38d1b47 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -5590,162 +5590,15 @@ define float @v_log_f32_dynamic_mode(float %in) #1 { } define float @v_log_f32_undef() { -; SI-SDAG-LABEL: v_log_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; 
VI-SDAG-LABEL: v_log_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3805fdf4, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3f317000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3805fdf4, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3805fdf4, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3f317217 -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3377d1cf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3f317217 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3377d1cf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3f317217, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x41b17218 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; 
GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: 
v_log_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 8d1a231..058933f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -5590,162 +5590,15 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 { } define float @v_log10_f32_undef() { -; SI-SDAG-LABEL: v_log10_f32_undef: -; SI-SDAG: ; %bb.0: -; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; SI-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; SI-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; SI-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; SI-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; SI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; SI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; SI-GISEL-LABEL: v_log10_f32_undef: -; SI-GISEL: ; %bb.0: -; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; SI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; SI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; SI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; SI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; SI-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; SI-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; SI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; SI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; SI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; SI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; SI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; SI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; VI-SDAG-LABEL: 
v_log10_f32_undef: -; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SDAG-NEXT: v_log_f32_e32 v0, s4 -; VI-SDAG-NEXT: s_mov_b32 s4, 0x7f800000 -; VI-SDAG-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-SDAG-NEXT: v_sub_f32_e32 v3, v0, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x369a84fb, v1 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v3 -; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3e9a2000, v3 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-SDAG-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4 -; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; VI-GISEL-LABEL: v_log10_f32_undef: -; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-GISEL-NEXT: v_log_f32_e32 v0, v0 -; VI-GISEL-NEXT: v_and_b32_e32 v1, 0xfffff000, v0 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x369a84fb, v1 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x369a84fb, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a2000, v2 -; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a2000, v1 -; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; VI-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-SDAG-LABEL: v_log10_f32_undef: -; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX900-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX900-SDAG-NEXT: s_mov_b32 s4, 0x3e9a209a -; GFX900-SDAG-NEXT: s_mov_b32 s5, 0x3284fbcf -; GFX900-SDAG-NEXT: s_mov_b32 s6, 0x7f800000 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s4, -v1 -; GFX900-SDAG-NEXT: v_fma_f32 v2, v0, s5, v2 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s6 -; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX900-GISEL-LABEL: v_log10_f32_undef: -; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3e9a209a -; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x3284fbcf -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, 0x3e9a209a, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v1, -v2 -; GFX900-GISEL-NEXT: v_fma_f32 v1, v0, v3, v1 -; GFX900-GISEL-NEXT: v_add_f32_e32 v1, v2, v1 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, v2 -; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x411a209b -; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log10_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0| -; GFX1100-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-SDAG-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 -; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log10_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log10_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log10_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_undef: ; 
R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 7ca72bf..4ca612a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3542,45 +3542,15 @@ define float @v_log2_f32_dynamic_mode(float %in) #1 { } define float @v_log2_f32_undef() { -; GFX689-SDAG-LABEL: v_log2_f32_undef: -; GFX689-SDAG: ; %bb.0: -; GFX689-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-SDAG-NEXT: v_log_f32_e32 v0, s4 -; GFX689-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX689-GISEL-LABEL: v_log2_f32_undef: -; GFX689-GISEL: ; %bb.0: -; GFX689-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX689-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000 -; GFX689-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 -; GFX689-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX689-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX689-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000 -; GFX689-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX689-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX689-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX1100-SDAG-LABEL: v_log2_f32_undef: -; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, s0 -; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX689-LABEL: v_log2_f32_undef: +; GFX689: ; %bb.0: +; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX689-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-GISEL-LABEL: v_log2_f32_undef: -; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_mul_f32_e64 v0, 0x4f800000, s0 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x800000, s0 -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, s0, v0, vcc_lo -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 
0x42000000, vcc_lo -; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX1100-LABEL: v_log2_f32_undef: +; GFX1100: ; %bb.0: +; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_undef: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 355f77a..af914bd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll new file mode 100644 index 0000000..7a355a3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 %s -o - | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.sin.bf16(bfloat) #0 + +define 
amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_4: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x3f23 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 4.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { +; GCN-LABEL: sin_bf16_constant_100: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GCN-NEXT: v_sin_bf16_e32 v0, 0x417f +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: global_store_b16 v1, v0, s[0:1] +; GCN-NEXT: s_endpgm + %sin = call bfloat @llvm.sin.bf16(bfloat 100.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir new file mode 100644 index 0000000..76e2092 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-store-opt-scale-offset.mir @@ -0,0 +1,104 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: merge_global_load_dword_2_no_scale_offset +body: | + bb.0.entry: + + ; GCN-LABEL: name: merge_global_load_dword_2_no_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR [[DEF]], [[DEF1]], 0, 1, implicit $exec :: (load (s64) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1) + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY 
[[GLOBAL_LOAD_DWORDX2_SADDR]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1 + ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[COPY]] + %0:sreg_64_xexec_xnull = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +--- +name: no_merge_global_load_dword_2_same_scale_offset +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_2_same_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2049, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sreg_64_xexec_xnull = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2049, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2049, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... 
+ +--- +name: no_merge_global_load_dword_2_different_scale_offset +body: | + bb.0.entry: + + ; GCN-LABEL: name: no_merge_global_load_dword_2_different_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sreg_64_xexec_xnull = IMPLICIT_DEF + ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, addrspace 1) + ; GCN-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[DEF]], [[DEF1]], 4, 2048, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, addrspace 1) + ; GCN-NEXT: S_NOP 0, implicit [[DEF1]], implicit [[GLOBAL_LOAD_DWORD_SADDR]] + %0:sreg_64_xexec_xnull = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1) + %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2048, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1) + S_NOP 0, implicit %1, implicit %2 +... + +# NB: We do not currently support merging SGPR offset and SGPR+Imm offset forms +# of S_LOAD, but the check stays the same: these cannot be merged with different +# scale offsets. +# +# We also do not currently merge flat scratch instructions, although a common +# check in the merge logic that CPol shall not be set for merge to happen. 
+ +--- +name: merge_s_load_x1_x1_imm_no_scale_offset +body: | + bb.0: + ; GCN-LABEL: name: merge_s_load_x1_x1_imm_no_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: no_merge_s_load_x1_x1_imm_same_scale_offset +body: | + bb.0: + ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_same_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 2048 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 2048 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32)) +... 
+ +--- +name: no_merge_s_load_x1_x1_imm_different_scale_offset +body: | + bb.0: + ; GCN-LABEL: name: no_merge_s_load_x1_x1_imm_different_scale_offset + ; GCN: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 4, 2048 :: (dereferenceable invariant load (s32)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 2048 :: (dereferenceable invariant load (s32)) +... diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll index 047bdde..8281320 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-unoptimized-debug-data.ll @@ -11,11 +11,13 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[BUF_PTR_VAR]], [[META10:![0-9]+]], !DIExpression(), [[DBG21]]) ; CHECK-NEXT: [[AUX_PTR_VAR:%.*]] = alloca i160, align 32, addrspace(5), !dbg [[DBG22:![0-9]+]] ; CHECK-NEXT: #dbg_value(ptr addrspace(5) [[AUX_PTR_VAR]], [[META12:![0-9]+]], !DIExpression(), [[DBG22]]) -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META13:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META13:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META23:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF]], [[META13]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META23]]) ; CHECK-NEXT: [[BUF_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF]] to i160, !dbg [[DBG24:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_INT:%.*]] = shl nuw i160 
[[BUF_PTR_INT_RSRC]], 32, !dbg [[DBG24]] ; CHECK-NEXT: store i160 [[BUF_PTR_INT]], ptr addrspace(5) [[BUF_PTR_VAR]], align 32, !dbg [[DBG24]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META15:![0-9]+]], !DIExpression(), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(i32 0, [[META15:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[META25:![0-9]+]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX]], [[META15]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[META25]]) ; CHECK-NEXT: [[AUX_PTR_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[AUX]] to i160, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: [[AUX_PTR_INT:%.*]] = shl nuw i160 [[AUX_PTR_INT_RSRC]], 32, !dbg [[DBG26]] ; CHECK-NEXT: store i160 [[AUX_PTR_INT]], ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG26]] @@ -24,10 +26,12 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128, !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP2]] to ptr addrspace(8), !dbg [[DBG27]] ; CHECK-NEXT: [[BUF_PTR_2_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_2]] to i32, !dbg [[DBG27]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META16:![0-9]+]], !DIExpression(), [[DBG27]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_2_PTR_OFF]], [[META16:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG27]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]], [[META16]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG27]]) ; CHECK-NEXT: [[BUF_PTR_3_IDX:%.*]] = mul i32 [[IDX]], 4, !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_3:%.*]] = add i32 [[BUF_PTR_2_PTR_OFF]], [[BUF_PTR_3_IDX]], !dbg [[DBG28]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META17:![0-9]+]], !DIExpression(), [[DBG28]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_3]], [[META17:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG28]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) 
[[BUF_PTR_2_PTR_RSRC]], [[META17]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG28]]) ; CHECK-NEXT: [[BUF_PTR_3_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[BUF_PTR_2_PTR_RSRC]] to i160, !dbg [[DBG29:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i160 [[BUF_PTR_3_INT_RSRC]], 32, !dbg [[DBG29]] ; CHECK-NEXT: [[BUF_PTR_3_INT_OFF:%.*]] = zext i32 [[BUF_PTR_3]] to i160, !dbg [[DBG29]] @@ -38,7 +42,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP5:%.*]] = trunc i160 [[TMP4]] to i128, !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_RSRC:%.*]] = inttoptr i128 [[TMP5]] to ptr addrspace(8), !dbg [[DBG30]] ; CHECK-NEXT: [[BUF_PTR_4_PTR_OFF:%.*]] = trunc i160 [[BUF_PTR_4]] to i32, !dbg [[DBG30]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META18:![0-9]+]], !DIExpression(), [[DBG30]]) +; CHECK-NEXT: #dbg_value(i32 [[BUF_PTR_4_PTR_OFF]], [[META18:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG30]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[BUF_PTR_4_PTR_RSRC]], [[META18]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG30]]) ; CHECK-NEXT: [[RET:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF_PTR_4_PTR_RSRC]], i32 [[BUF_PTR_4_PTR_OFF]], i32 0, i32 0), !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: #dbg_value(float [[RET]], [[META19:![0-9]+]], !DIExpression(), [[DBG31]]) ; CHECK-NEXT: [[AUX_PTR_2:%.*]] = load i160, ptr addrspace(5) [[AUX_PTR_VAR]], align 32, !dbg [[DBG32:![0-9]+]] @@ -46,7 +51,8 @@ define float @debug_stash_pointer(ptr addrspace(8) %buf, i32 %idx, ptr addrspace ; CHECK-NEXT: [[TMP7:%.*]] = trunc i160 [[TMP6]] to i128, !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_RSRC:%.*]] = inttoptr i128 [[TMP7]] to ptr addrspace(8), !dbg [[DBG32]] ; CHECK-NEXT: [[AUX_PTR_2_PTR_OFF:%.*]] = trunc i160 [[AUX_PTR_2]] to i32, !dbg [[DBG32]] -; CHECK-NEXT: #dbg_value({ ptr addrspace(8), i32 } poison, [[META20:![0-9]+]], !DIExpression(), [[DBG32]]) +; 
CHECK-NEXT: #dbg_value(i32 [[AUX_PTR_2_PTR_OFF]], [[META20:![0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 128, 32), [[DBG32]]) +; CHECK-NEXT: #dbg_value(ptr addrspace(8) [[AUX_PTR_2_PTR_RSRC]], [[META20]], !DIExpression(DW_OP_LLVM_fragment, 0, 128), [[DBG32]]) ; CHECK-NEXT: [[BUF_PTR_4_LEGAL:%.*]] = bitcast i160 [[BUF_PTR_4]] to <5 x i32>, !dbg [[DBG33:![0-9]+]] ; CHECK-NEXT: [[BUF_PTR_4_SLICE_0:%.*]] = shufflevector <5 x i32> [[BUF_PTR_4_LEGAL]], <5 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>, !dbg [[DBG33]] ; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> [[BUF_PTR_4_SLICE_0]], ptr addrspace(8) align 32 [[AUX_PTR_2_PTR_RSRC]], i32 [[AUX_PTR_2_PTR_OFF]], i32 0, i32 0), !dbg [[DBG33]] diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir index 409b1d6..ce67a2e 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards.mir @@ -33,7 +33,7 @@ name: asm_write_vgpr_accvgpr_write_read body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -47,7 +47,7 @@ name: asm_write_vgpr_accvgpr_write_read_partialnop body: | bb.0: - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr0 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr0 S_NOP 0 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... @@ -60,7 +60,7 @@ name: asm_write_vgpr_accvgpr_write_read_otherreg body: | bb.0: liveins: $vgpr0 - INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 1966090 /* regdef:VGPR_32 */, def $vgpr1 + INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def $vgpr1 $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ... 
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll index ae35d0d..e6bc733 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable-dvgpr.ll @@ -17,6 +17,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll index 638dc89..310040d 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll @@ -19,6 +19,7 @@ ; CHECK-NEXT: .debug_mode: 0 ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0x200 diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll index fb6ac2e..c1846c0 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-dvgpr.ll @@ -59,6 +59,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true @@ -113,6 +114,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -124,6 +126,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: 
.forward_progress: true ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false @@ -135,6 +138,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true ; CHECK-NEXT: .scratch_en: false diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll index 15778c8..5c0c366 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NEXT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll index 644722b..830872a 100644 --- a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.6.ll @@ -62,6 +62,7 @@ ; CHECK-NEXT: 
.entry_point_symbol: _amdgpu_cs_main ; CHECK-NEXT: .excp_en: 0 ; CHECK-NEXT: .float_mode: 0xc0 +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .image_op: false ; CHECK-NEXT: .lds_size: 0 @@ -118,6 +119,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_gs_main ; CHECK-NEXT: .entry_point_symbol: gs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x200 ; CHECK-NEXT: .mem_ordered: true @@ -130,6 +132,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_hs_main ; CHECK-NEXT: .entry_point_symbol: hs_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0x1000 ; CHECK-NEXT: .mem_ordered: true @@ -142,6 +145,7 @@ ; CHECK-NEXT: .debug_mode: false ; CHECK-NOT: .entry_point: _amdgpu_ps_main ; CHECK-NEXT: .entry_point_symbol: ps_shader +; CHECK-NEXT: .forward_progress: true ; GFX11-NEXT: .ieee_mode: false ; CHECK-NEXT: .lds_size: 0 ; CHECK-NEXT: .mem_ordered: true diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir index c9d0cf3..fef7332 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-failure-overlapping-insert-assert.mir @@ -45,13 +45,13 @@ body: | INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def $agpr0 %14:vgpr_32 = COPY killed $agpr0 - INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 11534346 /* regdef:VReg_512 */, def %7, 10158090 /* regdef:VReg_256 */, def %8, 4784138 /* regdef:VReg_128 */, def %9, 3670026 /* regdef:VReg_96 */, def %10, 3670026 /* regdef:VReg_96 */, def %11 + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 27262986 /* regdef:VReg_512 */, def %7, 13565962 /* regdef:VReg_256 */, def %8, 6094858 /* regdef:VReg_128 */, def %9, 
4784138 /* regdef:VReg_96 */, def %10, 4784138 /* regdef:VReg_96 */, def %11 INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 12 /* clobber */, implicit-def dead early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 11534345 /* reguse:VReg_512 */, %7 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 10158089 /* reguse:VReg_256 */, %8 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_128 */, %9 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %10 - INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3670025 /* reguse:VReg_96 */, %11 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 27262985 /* reguse:VReg_512 */, %7 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 13565961 /* reguse:VReg_256 */, %8 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6094857 /* reguse:VReg_128 */, %9 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %10 + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 4784137 /* reguse:VReg_96 */, %11 $agpr1 = COPY %14 INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 9 /* reguse */, killed $agpr1 SI_RETURN diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 5d0e4bf..8fe68ba 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; 
GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -545,7 +538,6 @@ 
define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1084,10 +1076,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1900,28 +1892,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], 
s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -1932,7 +1917,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def 
$sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2471,10 +2455,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll new file mode 100644 index 0000000..b5bb68e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s + +define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b32_idxprom: +; GCN: ; %bb.0: 
; %entry +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +; 'i32 %idx' is a signed index while SMRD soffset is unsigned, thus it is not selected. + +define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idx32: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_ashr_i32 s3, s2, 31 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idx32: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_ashr_i32 s3, s2, 31 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; SDAG-LABEL: s_load_b32_idxprom_wrong_stride: +; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; SDAG-NEXT: 
s_add_nc_u64 s[0:1], s[0:1], s[2:3] +; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0 +; SDAG-NEXT: s_wait_kmcnt 0x0 +; SDAG-NEXT: v_mov_b32_e32 v0, s0 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_load_b32_idxprom_wrong_stride: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3 +; GISEL-NEXT: s_add_co_u32 s0, s0, s2 +; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3 +; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b16_idxprom_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b64_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 
%idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b96_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b128_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b256_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, 
ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) { +; GCN-LABEL: s_load_b512_idxprom: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b32_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b32_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, 
s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd + %ret = load float, ptr addrspace(4) %arrayidx, align 4 + ret float %ret +} + +; Note: this is a byte load, there is nothing to scale + +define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b8_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd + %ld = load i8, ptr addrspace(4) %arrayidx + %ret.i32 = zext i8 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b16_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float 
+ ret float %ret +} + +define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b16_idxprom_range_ioffset: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %idxadd = add i64 %idxprom, 16 + %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd + %ld = load i16, ptr addrspace(4) %arrayidx, align 2 + %ret.i32 = zext i16 %ld to i32 + %ret = bitcast i32 %ret.i32 to float + ret float %ret +} + +define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b64_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <2 x float> %ret +} + +define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b96_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr 
addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom + %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <3 x float> %ret +} + +define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b128_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <4 x float> %ret +} + +define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) { +; GCN-LABEL: s_load_b256_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <8 x float> %ret +} + +define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) { 
+; GCN-LABEL: s_load_b512_idxprom_range: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset +; GCN-NEXT: s_wait_kmcnt 0x0 +; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7 +; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9 +; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11 +; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13 +; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15 +; GCN-NEXT: ; return to shader part epilog +entry: + %idx = load i32, ptr addrspace(4) %p, align 4, !range !0 + %idxprom = zext i32 %idx to i64 + %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom + %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4 + ret <16 x float> %ret +} + +!0 = !{i32 0, i32 1024} diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index a6b8ea3..6da7d1b 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 +; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 +; 
TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir new file mode 100644 index 0000000..93f4891 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions-pei.mir @@ -0,0 +1,448 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=prologepilog -o - %s | FileCheck %s + +--- +name: save_inactive_lanes_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_inactive_lanes_non_csr_vgpr + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + 
$vgpr0 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... +--- +name: save_all_lanes_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: save_all_lanes_csr_vgpr + ; CHECK: liveins: $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr40 = V_MOV_B32_e32 14, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0 + +... 
+--- +name: save_csr_sgpr_to_non_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_non_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191, $vgpr192 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr192, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr192 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr192 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr192 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr192, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: save_csr_sgpr_to_csr_vgpr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr191 + ; CHECK-LABEL: name: save_csr_sgpr_to_csr_vgpr + ; CHECK: liveins: $sgpr20, $vgpr191 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr191, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + ; CHECK-NEXT: $sgpr20 = S_MOV_B32 14, implicit $exec + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + ; CHECK-NEXT: $vgpr191 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr20 = S_MOV_B32 14, implicit $exec + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... 
+--- +name: vgpr_and_sgpr_csr +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: vgpr_and_sgpr_csr + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, 
implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo + +... +--- +name: split_orig_exec +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 4 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + hasSpilledSGPRs: true + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + spillPhysVGPRs: + - '$vgpr191' + wwmReservedRegs: + - '$vgpr191' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr191 + + ; CHECK-LABEL: name: split_orig_exec + ; CHECK: liveins: $sgpr20, $vgpr0, $vgpr1, $vgpr40, $vgpr49 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr49, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr0 + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + 
; CHECK-NEXT: $sgpr3 = COPY $vcc_lo + ; CHECK-NEXT: S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + ; CHECK-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr0, 0 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr3, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr49 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr3 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + $vgpr191 = SI_SPILL_S32_TO_VGPR killed $sgpr20, 0, $vgpr191 + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit-def $vgpr40, implicit-def $sgpr20 + $sgpr3 = COPY $vcc_lo + S_NOP 0, implicit-def $vgpr49, implicit-def $sgpr40 + $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr191, 0 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr3 + +... 
+--- +name: vgpr_superregs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + ; CHECK-LABEL: name: vgpr_superregs + ; CHECK: liveins: $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr40, $vgpr41, $vgpr42 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr40, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr41, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr42, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 14, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit-def 
$vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + ; CHECK-NEXT: $vgpr40 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: $vgpr41 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: $vgpr42 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 28, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $sgpr0, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 14, implicit $exec + S_NOP 0, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr40_vgpr41_vgpr42 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... 
+--- +name: dont_restore_used_vgprs +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr20' } + - { reg: '$vgpr40' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr20, $vgpr40 + + ; CHECK-LABEL: name: dont_restore_used_vgprs + ; CHECK: liveins: $vgpr0, $vgpr20, $vgpr40 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $sgpr0 + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + renamable $sgpr0 = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + S_NOP 0, implicit $vgpr0, implicit $vgpr20, implicit $vgpr40 + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $sgpr0, implicit killed $vgpr0 + +... 
+--- +name: multiple_blocks +alignment: 1 +tracksRegLiveness: true +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +tracksDebugUserValues: true +liveins: + - { reg: '$vgpr0' } + - { reg: '$vgpr1' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + maxKernArgAlign: 1 + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + returnsVoid: false + occupancy: 16 + sgprForEXECCopy: '$sgpr105' + isWholeWaveFunction: true +body: | + ; CHECK-LABEL: name: multiple_blocks + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vcc_lo = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr0, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 -1 + ; CHECK-NEXT: $sgpr1 = S_MOV_B32 $exec_lo + ; CHECK-NEXT: V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + ; CHECK-NEXT: renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + ; CHECK-NEXT: $exec_lo = S_XOR_B32 $vcc_lo, -1, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = 
SCRATCH_LOAD_DWORD_SADDR $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit $vgpr0(tied-def 0) :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: $exec_lo = S_MOV_B32 $vcc_lo + ; CHECK-NEXT: SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + bb.0: + successors: %bb.1, %bb.2 + liveins: $vgpr0, $vgpr1 + + renamable $vcc_lo = SI_WHOLE_WAVE_FUNC_SETUP implicit-def dead $exec, implicit $exec + $sgpr1 = S_MOV_B32 $exec_lo + V_CMPX_EQ_U32_nosdst_e64 $vgpr0, $vgpr1, implicit-def $exec, implicit $exec + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + renamable $vgpr1 = V_ADD_U32_e64 $vgpr0, $vgpr1, 0, implicit $exec + + bb.2: + liveins: $vcc_lo, $sgpr1, $vgpr0, $vgpr1 + + $exec_lo = S_OR_B32 $exec_lo, killed renamable $sgpr1, implicit-def $scc + renamable $vgpr0 = V_CNDMASK_B32_e64 0, $vgpr1, 0, $vgpr0, $vcc_lo, implicit $exec + SI_WHOLE_WAVE_FUNC_RETURN killed renamable $vcc_lo, implicit $vgpr0 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll new file mode 100644 index 0000000..53d0292 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -0,0 +1,2414 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=DAGISEL64 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck --check-prefix=GISEL64 %s + +; Make sure the i1 %active is passed through EXEC. +; The EXEC mask should be set to -1 for the duration of the function +; and restored to its original value in the epilogue. +; We will also need to restore the inactive lanes for any allocated VGPRs. 
+define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: basic_test: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: basic_test: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; 
GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: basic_test: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: basic_test: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; 
GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if there's only one use for %active. +define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: single_use_of_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: single_use_of_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc_lo +; GISEL-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: single_use_of_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: single_use_of_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 17, v1, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %y = select i1 %active, i32 %b, i32 17 + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %a, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Make sure we don't crash if %active is not used at all. +define amdgpu_gfx_whole_wave i32 @unused_active(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: unused_active: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: unused_active: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_mov_b32_e32 v0, 14 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 
s[30:31] +; +; DAGISEL64-LABEL: unused_active: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, 14 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: unused_active: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_mov_b32_e32 v0, 14 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + ret i32 14 +} + +; For any used VGPRs (including those used for SGPR spills), we need to restore the inactive lanes. +; For CSR VGPRs, we need to restore all lanes. 
+define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber non-CSR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; DAGISEL-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xf1ff +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: 
s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber non-CSR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 3, v1 +; GISEL-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xf1ff +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber non-CSR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; DAGISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xf1ff +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 offset:12 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: v_writelane_b32 v2, s20, 0 +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber non-CSR 
+; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: scratch_load_b32 v40, off, s32 offset:12 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc +; GISEL64-NEXT: v_readlane_b32 s20, v2, 0 +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:16 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xf1ff +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i32 %a, i32 5 + %y = select i1 %active, i32 %b, i32 3 + call void asm sideeffect "; clobber CSR", "~{v40},~{s48}"() + call void asm sideeffect "; clobber non-CSR", "~{v49},~{s20}"() + %ret = call i32 @llvm.amdgcn.update.dpp.i32(i32 %x, i32 %y, i32 1, i32 1, i32 1, i1 false) + ret i32 %ret +} + +; Save and restore all lanes of v40. 
+define amdgpu_gfx_whole_wave void @csr_vgpr_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: csr_vgpr_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_or_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR VGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: csr_vgpr_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_or_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR VGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: csr_vgpr_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR VGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] 
+; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: csr_vgpr_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR VGPR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: scratch_load_b32 v40, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR VGPR", "~{v40}"() + ret void +} + +define amdgpu_gfx_whole_wave void @sgpr_spill_only(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: sgpr_spill_only: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL-NEXT: ;;#ASMSTART +; DAGISEL-NEXT: ; clobber CSR SGPR +; DAGISEL-NEXT: ;;#ASMEND +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: sgpr_spill_only: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: 
s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_writelane_b32 v0, s68, 0 +; GISEL-NEXT: ;;#ASMSTART +; GISEL-NEXT: ; clobber CSR SGPR +; GISEL-NEXT: ;;#ASMEND +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: sgpr_spill_only: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_writelane_b32 v0, s68, 0 +; DAGISEL64-NEXT: ;;#ASMSTART +; DAGISEL64-NEXT: ; clobber CSR SGPR +; DAGISEL64-NEXT: ;;#ASMEND +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: sgpr_spill_only: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_writelane_b32 v0, s68, 0 
+; GISEL64-NEXT: ;;#ASMSTART +; GISEL64-NEXT: ; clobber CSR SGPR +; GISEL64-NEXT: ;;#ASMEND +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s68, v0, 0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + call void asm sideeffect "; clobber CSR SGPR", "~{s68}"() + ret void +} + +define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) { +; DAGISEL-LABEL: multiple_blocks: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: s_mov_b32 s1, exec_lo +; DAGISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL-NEXT: ; %bb.1: ; %if.then +; DAGISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; DAGISEL-NEXT: ; %bb.2: ; %if.end +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: multiple_blocks: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; 
GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s1, exec_lo +; GISEL-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL-NEXT: ; %bb.1: ; %if.then +; GISEL-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL-NEXT: ; %bb.2: ; %if.end +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: multiple_blocks: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL64-NEXT: s_mov_b64 s[2:3], exec +; DAGISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; DAGISEL64-NEXT: ; %bb.1: ; %if.then +; DAGISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; DAGISEL64-NEXT: ; %bb.2: ; %if.end +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; 
DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: multiple_blocks: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL64-NEXT: s_mov_b64 s[2:3], exec +; GISEL64-NEXT: v_cmpx_eq_u32_e64 v0, v1 +; GISEL64-NEXT: ; %bb.1: ; %if.then +; GISEL64-NEXT: v_add_nc_u32_e32 v1, v0, v1 +; GISEL64-NEXT: ; %bb.2: ; %if.end +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x1 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %c = icmp eq i32 %a, %b + br i1 %c, label %if.then, label %if.end + +if.then: ; preds = %0 + %d = add i32 %a, %b + br label %if.end + +if.end: + %f = phi i32 [ %d, %if.then ], [ %b, %0 ] + %e = select i1 %active, i32 %a, i32 %f + ret i32 %e +} + +define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) { +; DAGISEL-LABEL: ret_64: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: 
scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_dual_cndmask_b32 v1, 0, v1 :: v_dual_cndmask_b32 v0, 5, v0 +; DAGISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; DAGISEL-NEXT: s_clause 0x3 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: ret_64: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_cndmask_b32 v0, 5, v0 :: v_dual_cndmask_b32 v1, 0, v1 +; GISEL-NEXT: v_dual_cndmask_b32 v2, 3, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; 
GISEL-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL-NEXT: s_xor_b32 exec_lo, vcc_lo, -1 +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: s_mov_b32 exec_lo, vcc_lo +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: ret_64: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; DAGISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; DAGISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; DAGISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; DAGISEL64-NEXT: s_clause 0x3 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: s_mov_b64 exec, vcc +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; 
GISEL64-LABEL: ret_64: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v2, 3, v2, vcc +; GISEL64-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL64-NEXT: v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GISEL64-NEXT: s_xor_b64 exec, vcc, -1 +; GISEL64-NEXT: s_clause 0x3 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: s_mov_b64 exec, vcc +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %x = select i1 %active, i64 %a, i64 5 + %y = select i1 %active, i64 %b, i64 3 + %ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false) + ret i64 %ret +} + +define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i32> inreg %v4i32, float inreg %float, ptr addrspace(5) inreg %ptr, ptr addrspace(5) inreg %ptr2) { +; DAGISEL-LABEL: inreg_args: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; 
DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s9 +; DAGISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s6 +; DAGISEL-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, s8 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL-NEXT: s_clause 0x1 +; DAGISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: s_clause 0x5 +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: inreg_args: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 
+; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: s_mov_b32 s0, s5 +; GISEL-NEXT: s_mov_b32 s1, s6 +; GISEL-NEXT: s_mov_b32 s2, s7 +; GISEL-NEXT: s_mov_b32 s3, s8 +; GISEL-NEXT: v_mov_b32_e32 v4, s4 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3 +; GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: scratch_store_b32 off, v4, s10 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL-NEXT: scratch_store_b32 off, v5, s11 +; GISEL-NEXT: s_xor_b32 exec_lo, s34, -1 +; GISEL-NEXT: s_clause 0x5 +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL-NEXT: s_mov_b32 exec_lo, s34 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: inreg_args: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v4, s4 +; DAGISEL64-NEXT: v_mov_b32_e32 v0, s5 +; DAGISEL64-NEXT: v_mov_b32_e32 v1, s6 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, s7 +; 
DAGISEL64-NEXT: v_mov_b32_e32 v3, s8 +; DAGISEL64-NEXT: v_mov_b32_e32 v5, s9 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s10 +; DAGISEL64-NEXT: s_clause 0x1 +; DAGISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s11 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x5 +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: inreg_args: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: s_mov_b32 s0, s5 +; GISEL64-NEXT: s_mov_b32 s1, s6 +; GISEL64-NEXT: s_mov_b32 s2, s7 +; GISEL64-NEXT: s_mov_b32 s3, s8 +; GISEL64-NEXT: v_mov_b32_e32 v4, s4 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_mov_b32_e32 v0, s0 +; GISEL64-NEXT: v_mov_b32_e32 v1, s1 +; GISEL64-NEXT: v_mov_b32_e32 v2, s2 +; GISEL64-NEXT: v_mov_b32_e32 v3, s3 +; GISEL64-NEXT: v_mov_b32_e32 v5, s9 +; GISEL64-NEXT: scratch_store_b32 off, v4, s10 +; GISEL64-NEXT: 
s_clause 0x1 +; GISEL64-NEXT: scratch_store_b128 off, v[0:3], s11 +; GISEL64-NEXT: scratch_store_b32 off, v5, s11 +; GISEL64-NEXT: s_xor_b64 exec, s[34:35], -1 +; GISEL64-NEXT: s_clause 0x5 +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL64-NEXT: s_mov_b64 exec, s[34:35] +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_setpc_b64 s[30:31] + store i32 %i32, ptr addrspace(5) %ptr + store <4 x i32> %v4i32, ptr addrspace(5) %ptr2 + store float %float, ptr addrspace(5) %ptr2 + ret void +} + +declare amdgpu_gfx <2 x half> @gfx_callee(<2 x half> %x, <2 x half> %y) + +define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) { +; DAGISEL-LABEL: call_gfx_from_whole_wave: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_mov_b32 s0, s33 +; DAGISEL-NEXT: s_mov_b32 s33, s32 +; DAGISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL-NEXT: scratch_store_b32 off, v10, s33 
offset:44 +; DAGISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; DAGISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL-NEXT: scratch_store_b32 off, v51, s33 
offset:176 +; DAGISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; DAGISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL-NEXT: 
scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; DAGISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL-NEXT: 
scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; DAGISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL-NEXT: scratch_store_b32 
off, v244, s33 offset:564 +; DAGISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: v_writelane_b32 v40, s0, 3 +; DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL-NEXT: v_swap_b32 v0, v1 +; DAGISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL-NEXT: v_writelane_b32 v40, s4, 0 +; DAGISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL-NEXT: v_writelane_b32 v40, s30, 1 +; DAGISEL-NEXT: v_writelane_b32 v40, s31, 2 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL-NEXT: v_readlane_b32 s31, v40, 2 +; DAGISEL-NEXT: v_readlane_b32 s30, v40, 1 +; DAGISEL-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL-NEXT: v_readlane_b32 s0, v40, 3 +; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL-NEXT: s_mov_b32 s32, s33 +; DAGISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; DAGISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL-NEXT: 
scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; DAGISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL-NEXT: scratch_load_b32 v54, off, s33 
offset:188 +; DAGISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; DAGISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; DAGISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL-NEXT: scratch_load_b32 v119, off, s33 
offset:320 +; DAGISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL-NEXT: scratch_load_b32 
v192, off, s33 offset:452 +; DAGISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; DAGISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; DAGISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s4 +; DAGISEL-NEXT: s_mov_b32 s33, s0 +; 
DAGISEL-NEXT: s_wait_loadcnt 0x0 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: call_gfx_from_whole_wave: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_mov_b32 s0, s33 +; GISEL-NEXT: s_mov_b32 s33, s32 +; GISEL-NEXT: s_xor_saveexec_b32 s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; GISEL-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; GISEL-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL-NEXT: scratch_store_b32 off, v26, s33 
offset:108 +; GISEL-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; GISEL-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL-NEXT: scratch_store_b32 off, v84, 
s33 offset:244 +; GISEL-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; GISEL-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; GISEL-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL-NEXT: 
scratch_store_b32 off, v150, s33 offset:380 +; GISEL-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; GISEL-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL-NEXT: scratch_store_b32 off, 
v215, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; GISEL-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; GISEL-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: v_writelane_b32 v40, s0, 3 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_swap_b32 v0, v1 +; GISEL-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL-NEXT: v_writelane_b32 v40, s4, 0 +; GISEL-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL-NEXT: v_writelane_b32 v40, s30, 1 +; GISEL-NEXT: v_writelane_b32 v40, s31, 2 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_readlane_b32 s31, v40, 2 +; GISEL-NEXT: v_readlane_b32 s30, v40, 1 +; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: v_readlane_b32 s0, v40, 3 +; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 +; GISEL-NEXT: s_xor_b32 exec_lo, s4, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: 
scratch_load_b32 v0, off, s33 offset:4 +; GISEL-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; GISEL-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; GISEL-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL-NEXT: 
scratch_load_b32 v35, off, s33 offset:144 +; GISEL-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; GISEL-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; GISEL-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL-NEXT: scratch_load_b32 v101, off, s33 
offset:280 +; GISEL-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; GISEL-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; GISEL-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; GISEL-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL-NEXT: scratch_load_b32 v167, off, 
s33 offset:416 +; GISEL-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; GISEL-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL-NEXT: scratch_load_b32 v241, off, 
s33 offset:552 +; GISEL-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL-NEXT: s_mov_b32 exec_lo, s4 +; GISEL-NEXT: s_mov_b32 s33, s0 +; GISEL-NEXT: s_wait_loadcnt 0x0 +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_setpc_b64 s[30:31] +; +; DAGISEL64-LABEL: call_gfx_from_whole_wave: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_mov_b32 s0, s33 +; DAGISEL64-NEXT: s_mov_b32 s33, s32 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; DAGISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; DAGISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; DAGISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; DAGISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; DAGISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; DAGISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; DAGISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; DAGISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; DAGISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; DAGISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; DAGISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; 
DAGISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; DAGISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; DAGISEL64-NEXT: scratch_store_b32 off, v19, s33 offset:80 +; DAGISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; DAGISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; DAGISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; DAGISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; DAGISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; DAGISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; DAGISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; DAGISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; DAGISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; DAGISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; DAGISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; DAGISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; DAGISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; DAGISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; DAGISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; DAGISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; DAGISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; DAGISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; DAGISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; DAGISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; DAGISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; DAGISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; DAGISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; DAGISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; DAGISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; DAGISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; DAGISEL64-NEXT: scratch_store_b32 off, v64, s33 
offset:196 +; DAGISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; DAGISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; DAGISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; DAGISEL64-NEXT: scratch_store_b32 off, v68, s33 offset:212 +; DAGISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; DAGISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; DAGISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; DAGISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; DAGISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; DAGISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; DAGISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; DAGISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; DAGISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; DAGISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; DAGISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; DAGISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; DAGISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; DAGISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; DAGISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; DAGISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; DAGISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; DAGISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; DAGISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; DAGISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; DAGISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; DAGISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; DAGISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; DAGISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; DAGISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; DAGISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v128, s33 offset:324 +; DAGISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; DAGISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; DAGISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; DAGISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; DAGISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; DAGISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; DAGISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; DAGISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; DAGISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; DAGISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; DAGISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; DAGISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; DAGISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; DAGISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; DAGISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; DAGISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; DAGISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; DAGISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; DAGISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; DAGISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; DAGISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; DAGISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; DAGISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; DAGISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; DAGISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; DAGISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; DAGISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; DAGISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; DAGISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v183, s33 offset:448 +; DAGISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; DAGISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; DAGISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; DAGISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; DAGISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; DAGISEL64-NEXT: scratch_store_b32 off, v197, s33 offset:472 +; DAGISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; DAGISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; DAGISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; DAGISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; DAGISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; DAGISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; DAGISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; DAGISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; DAGISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; DAGISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; DAGISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; DAGISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; DAGISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; DAGISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; DAGISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; DAGISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; DAGISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; DAGISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; DAGISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; DAGISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; DAGISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; DAGISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; DAGISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v246, s33 offset:572 +; DAGISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL64-NEXT: v_swap_b32 v0, v1 +; DAGISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; DAGISEL64-NEXT: v_writelane_b32 v40, s4, 0 +; DAGISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; DAGISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; DAGISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; DAGISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; DAGISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; DAGISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; DAGISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; DAGISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; DAGISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; DAGISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; DAGISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; DAGISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; DAGISEL64-NEXT: s_mov_b32 s32, s33 +; DAGISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; DAGISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; DAGISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; DAGISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; DAGISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; DAGISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; DAGISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; DAGISEL64-NEXT: 
scratch_load_b32 v12, off, s33 offset:52 +; DAGISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; DAGISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; DAGISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; DAGISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; DAGISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; DAGISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; DAGISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; DAGISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; DAGISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; DAGISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; DAGISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; DAGISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; DAGISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; DAGISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; DAGISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; DAGISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; DAGISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; DAGISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; DAGISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; DAGISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; DAGISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; DAGISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; DAGISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; DAGISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; DAGISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; DAGISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; DAGISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; DAGISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; DAGISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; DAGISEL64-NEXT: scratch_load_b32 v52, off, 
s33 offset:180 +; DAGISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; DAGISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; DAGISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; DAGISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; DAGISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; DAGISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; DAGISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; DAGISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; DAGISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; DAGISEL64-NEXT: scratch_load_b32 v70, off, s33 offset:220 +; DAGISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; DAGISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; DAGISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; DAGISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; DAGISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; DAGISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; DAGISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; DAGISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; DAGISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; DAGISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; DAGISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; DAGISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; DAGISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; DAGISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; DAGISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; DAGISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; DAGISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; DAGISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; DAGISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; DAGISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; DAGISEL64-NEXT: scratch_load_b32 v116, off, s33 
offset:308 +; DAGISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; DAGISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; DAGISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; DAGISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; DAGISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; DAGISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; DAGISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; DAGISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; DAGISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; DAGISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; DAGISEL64-NEXT: scratch_load_b32 v135, off, s33 offset:352 +; DAGISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; DAGISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; DAGISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; DAGISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; DAGISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; DAGISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; DAGISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; DAGISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; DAGISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; DAGISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; DAGISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; DAGISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; DAGISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; DAGISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; DAGISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; DAGISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; DAGISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; DAGISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; DAGISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; DAGISEL64-NEXT: 
scratch_load_b32 v180, off, s33 offset:436 +; DAGISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; DAGISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; DAGISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; DAGISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; DAGISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; DAGISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; DAGISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; DAGISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; DAGISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; DAGISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; DAGISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; DAGISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; DAGISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; DAGISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; DAGISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; DAGISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; DAGISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; DAGISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; DAGISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; DAGISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; DAGISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; DAGISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; DAGISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; DAGISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; DAGISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; DAGISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; DAGISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; DAGISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; DAGISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; DAGISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; 
DAGISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; DAGISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; DAGISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; DAGISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; DAGISEL64-NEXT: s_mov_b64 exec, s[4:5] +; DAGISEL64-NEXT: s_mov_b32 s33, s0 +; DAGISEL64-NEXT: s_wait_loadcnt 0x0 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_setpc_b64 s[30:31] +; +; GISEL64-LABEL: call_gfx_from_whole_wave: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_mov_b32 s0, s33 +; GISEL64-NEXT: s_mov_b32 s33, s32 +; GISEL64-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v0, s33 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v1, s33 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v2, s33 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v3, s33 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v4, s33 offset:20 +; GISEL64-NEXT: scratch_store_b32 off, v5, s33 offset:24 +; GISEL64-NEXT: scratch_store_b32 off, v6, s33 offset:28 +; GISEL64-NEXT: scratch_store_b32 off, v7, s33 offset:32 +; GISEL64-NEXT: scratch_store_b32 off, v8, s33 offset:36 +; GISEL64-NEXT: scratch_store_b32 off, v9, s33 offset:40 +; GISEL64-NEXT: scratch_store_b32 off, v10, s33 offset:44 +; GISEL64-NEXT: scratch_store_b32 off, v11, s33 offset:48 +; GISEL64-NEXT: scratch_store_b32 off, v12, s33 offset:52 +; GISEL64-NEXT: scratch_store_b32 off, v13, s33 offset:56 +; GISEL64-NEXT: scratch_store_b32 off, v14, s33 offset:60 +; GISEL64-NEXT: scratch_store_b32 off, v15, s33 offset:64 +; GISEL64-NEXT: scratch_store_b32 off, v16, s33 offset:68 +; GISEL64-NEXT: scratch_store_b32 off, v17, s33 offset:72 +; GISEL64-NEXT: scratch_store_b32 off, v18, s33 offset:76 +; GISEL64-NEXT: scratch_store_b32 off, v19, s33 
offset:80 +; GISEL64-NEXT: scratch_store_b32 off, v20, s33 offset:84 +; GISEL64-NEXT: scratch_store_b32 off, v21, s33 offset:88 +; GISEL64-NEXT: scratch_store_b32 off, v22, s33 offset:92 +; GISEL64-NEXT: scratch_store_b32 off, v23, s33 offset:96 +; GISEL64-NEXT: scratch_store_b32 off, v24, s33 offset:100 +; GISEL64-NEXT: scratch_store_b32 off, v25, s33 offset:104 +; GISEL64-NEXT: scratch_store_b32 off, v26, s33 offset:108 +; GISEL64-NEXT: scratch_store_b32 off, v27, s33 offset:112 +; GISEL64-NEXT: scratch_store_b32 off, v28, s33 offset:116 +; GISEL64-NEXT: scratch_store_b32 off, v29, s33 offset:120 +; GISEL64-NEXT: scratch_store_b32 off, v30, s33 offset:124 +; GISEL64-NEXT: scratch_store_b32 off, v31, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v32, s33 offset:132 +; GISEL64-NEXT: scratch_store_b32 off, v33, s33 offset:136 +; GISEL64-NEXT: scratch_store_b32 off, v34, s33 offset:140 +; GISEL64-NEXT: scratch_store_b32 off, v35, s33 offset:144 +; GISEL64-NEXT: scratch_store_b32 off, v36, s33 offset:148 +; GISEL64-NEXT: scratch_store_b32 off, v37, s33 offset:152 +; GISEL64-NEXT: scratch_store_b32 off, v38, s33 offset:156 +; GISEL64-NEXT: scratch_store_b32 off, v39, s33 offset:160 +; GISEL64-NEXT: scratch_store_b32 off, v48, s33 offset:164 +; GISEL64-NEXT: scratch_store_b32 off, v49, s33 offset:168 +; GISEL64-NEXT: scratch_store_b32 off, v50, s33 offset:172 +; GISEL64-NEXT: scratch_store_b32 off, v51, s33 offset:176 +; GISEL64-NEXT: scratch_store_b32 off, v52, s33 offset:180 +; GISEL64-NEXT: scratch_store_b32 off, v53, s33 offset:184 +; GISEL64-NEXT: scratch_store_b32 off, v54, s33 offset:188 +; GISEL64-NEXT: scratch_store_b32 off, v55, s33 offset:192 +; GISEL64-NEXT: scratch_store_b32 off, v64, s33 offset:196 +; GISEL64-NEXT: scratch_store_b32 off, v65, s33 offset:200 +; GISEL64-NEXT: scratch_store_b32 off, v66, s33 offset:204 +; GISEL64-NEXT: scratch_store_b32 off, v67, s33 offset:208 +; GISEL64-NEXT: scratch_store_b32 off, 
v68, s33 offset:212 +; GISEL64-NEXT: scratch_store_b32 off, v69, s33 offset:216 +; GISEL64-NEXT: scratch_store_b32 off, v70, s33 offset:220 +; GISEL64-NEXT: scratch_store_b32 off, v71, s33 offset:224 +; GISEL64-NEXT: scratch_store_b32 off, v80, s33 offset:228 +; GISEL64-NEXT: scratch_store_b32 off, v81, s33 offset:232 +; GISEL64-NEXT: scratch_store_b32 off, v82, s33 offset:236 +; GISEL64-NEXT: scratch_store_b32 off, v83, s33 offset:240 +; GISEL64-NEXT: scratch_store_b32 off, v84, s33 offset:244 +; GISEL64-NEXT: scratch_store_b32 off, v85, s33 offset:248 +; GISEL64-NEXT: scratch_store_b32 off, v86, s33 offset:252 +; GISEL64-NEXT: scratch_store_b32 off, v87, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v96, s33 offset:260 +; GISEL64-NEXT: scratch_store_b32 off, v97, s33 offset:264 +; GISEL64-NEXT: scratch_store_b32 off, v98, s33 offset:268 +; GISEL64-NEXT: scratch_store_b32 off, v99, s33 offset:272 +; GISEL64-NEXT: scratch_store_b32 off, v100, s33 offset:276 +; GISEL64-NEXT: scratch_store_b32 off, v101, s33 offset:280 +; GISEL64-NEXT: scratch_store_b32 off, v102, s33 offset:284 +; GISEL64-NEXT: scratch_store_b32 off, v103, s33 offset:288 +; GISEL64-NEXT: scratch_store_b32 off, v112, s33 offset:292 +; GISEL64-NEXT: scratch_store_b32 off, v113, s33 offset:296 +; GISEL64-NEXT: scratch_store_b32 off, v114, s33 offset:300 +; GISEL64-NEXT: scratch_store_b32 off, v115, s33 offset:304 +; GISEL64-NEXT: scratch_store_b32 off, v116, s33 offset:308 +; GISEL64-NEXT: scratch_store_b32 off, v117, s33 offset:312 +; GISEL64-NEXT: scratch_store_b32 off, v118, s33 offset:316 +; GISEL64-NEXT: scratch_store_b32 off, v119, s33 offset:320 +; GISEL64-NEXT: scratch_store_b32 off, v128, s33 offset:324 +; GISEL64-NEXT: scratch_store_b32 off, v129, s33 offset:328 +; GISEL64-NEXT: scratch_store_b32 off, v130, s33 offset:332 +; GISEL64-NEXT: scratch_store_b32 off, v131, s33 offset:336 +; GISEL64-NEXT: scratch_store_b32 off, v132, s33 offset:340 +; 
GISEL64-NEXT: scratch_store_b32 off, v133, s33 offset:344 +; GISEL64-NEXT: scratch_store_b32 off, v134, s33 offset:348 +; GISEL64-NEXT: scratch_store_b32 off, v135, s33 offset:352 +; GISEL64-NEXT: scratch_store_b32 off, v144, s33 offset:356 +; GISEL64-NEXT: scratch_store_b32 off, v145, s33 offset:360 +; GISEL64-NEXT: scratch_store_b32 off, v146, s33 offset:364 +; GISEL64-NEXT: scratch_store_b32 off, v147, s33 offset:368 +; GISEL64-NEXT: scratch_store_b32 off, v148, s33 offset:372 +; GISEL64-NEXT: scratch_store_b32 off, v149, s33 offset:376 +; GISEL64-NEXT: scratch_store_b32 off, v150, s33 offset:380 +; GISEL64-NEXT: scratch_store_b32 off, v151, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v160, s33 offset:388 +; GISEL64-NEXT: scratch_store_b32 off, v161, s33 offset:392 +; GISEL64-NEXT: scratch_store_b32 off, v162, s33 offset:396 +; GISEL64-NEXT: scratch_store_b32 off, v163, s33 offset:400 +; GISEL64-NEXT: scratch_store_b32 off, v164, s33 offset:404 +; GISEL64-NEXT: scratch_store_b32 off, v165, s33 offset:408 +; GISEL64-NEXT: scratch_store_b32 off, v166, s33 offset:412 +; GISEL64-NEXT: scratch_store_b32 off, v167, s33 offset:416 +; GISEL64-NEXT: scratch_store_b32 off, v176, s33 offset:420 +; GISEL64-NEXT: scratch_store_b32 off, v177, s33 offset:424 +; GISEL64-NEXT: scratch_store_b32 off, v178, s33 offset:428 +; GISEL64-NEXT: scratch_store_b32 off, v179, s33 offset:432 +; GISEL64-NEXT: scratch_store_b32 off, v180, s33 offset:436 +; GISEL64-NEXT: scratch_store_b32 off, v181, s33 offset:440 +; GISEL64-NEXT: scratch_store_b32 off, v182, s33 offset:444 +; GISEL64-NEXT: scratch_store_b32 off, v183, s33 offset:448 +; GISEL64-NEXT: scratch_store_b32 off, v192, s33 offset:452 +; GISEL64-NEXT: scratch_store_b32 off, v193, s33 offset:456 +; GISEL64-NEXT: scratch_store_b32 off, v194, s33 offset:460 +; GISEL64-NEXT: scratch_store_b32 off, v195, s33 offset:464 +; GISEL64-NEXT: scratch_store_b32 off, v196, s33 offset:468 +; GISEL64-NEXT: 
scratch_store_b32 off, v197, s33 offset:472 +; GISEL64-NEXT: scratch_store_b32 off, v198, s33 offset:476 +; GISEL64-NEXT: scratch_store_b32 off, v199, s33 offset:480 +; GISEL64-NEXT: scratch_store_b32 off, v208, s33 offset:484 +; GISEL64-NEXT: scratch_store_b32 off, v209, s33 offset:488 +; GISEL64-NEXT: scratch_store_b32 off, v210, s33 offset:492 +; GISEL64-NEXT: scratch_store_b32 off, v211, s33 offset:496 +; GISEL64-NEXT: scratch_store_b32 off, v212, s33 offset:500 +; GISEL64-NEXT: scratch_store_b32 off, v213, s33 offset:504 +; GISEL64-NEXT: scratch_store_b32 off, v214, s33 offset:508 +; GISEL64-NEXT: scratch_store_b32 off, v215, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_store_b32 off, v224, s33 offset:516 +; GISEL64-NEXT: scratch_store_b32 off, v225, s33 offset:520 +; GISEL64-NEXT: scratch_store_b32 off, v226, s33 offset:524 +; GISEL64-NEXT: scratch_store_b32 off, v227, s33 offset:528 +; GISEL64-NEXT: scratch_store_b32 off, v228, s33 offset:532 +; GISEL64-NEXT: scratch_store_b32 off, v229, s33 offset:536 +; GISEL64-NEXT: scratch_store_b32 off, v230, s33 offset:540 +; GISEL64-NEXT: scratch_store_b32 off, v231, s33 offset:544 +; GISEL64-NEXT: scratch_store_b32 off, v240, s33 offset:548 +; GISEL64-NEXT: scratch_store_b32 off, v241, s33 offset:552 +; GISEL64-NEXT: scratch_store_b32 off, v242, s33 offset:556 +; GISEL64-NEXT: scratch_store_b32 off, v243, s33 offset:560 +; GISEL64-NEXT: scratch_store_b32 off, v244, s33 offset:564 +; GISEL64-NEXT: scratch_store_b32 off, v245, s33 offset:568 +; GISEL64-NEXT: scratch_store_b32 off, v246, s33 offset:572 +; GISEL64-NEXT: scratch_store_b32 off, v247, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: v_writelane_b32 v40, s0, 4 +; GISEL64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL64-NEXT: v_swap_b32 v0, v1 +; GISEL64-NEXT: s_mov_b32 s0, gfx_callee@abs32@lo +; GISEL64-NEXT: 
v_writelane_b32 v40, s4, 0 +; GISEL64-NEXT: s_mov_b32 s1, gfx_callee@abs32@hi +; GISEL64-NEXT: s_addk_co_i32 s32, 0x250 +; GISEL64-NEXT: v_writelane_b32 v40, s5, 1 +; GISEL64-NEXT: v_writelane_b32 v40, s30, 2 +; GISEL64-NEXT: v_writelane_b32 v40, s31, 3 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GISEL64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL64-NEXT: v_readlane_b32 s31, v40, 3 +; GISEL64-NEXT: v_readlane_b32 s30, v40, 2 +; GISEL64-NEXT: v_readlane_b32 s5, v40, 1 +; GISEL64-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL64-NEXT: v_readlane_b32 s0, v40, 4 +; GISEL64-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GISEL64-NEXT: s_mov_b32 s32, s33 +; GISEL64-NEXT: s_xor_b64 exec, s[4:5], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v0, off, s33 offset:4 +; GISEL64-NEXT: scratch_load_b32 v1, off, s33 offset:8 +; GISEL64-NEXT: scratch_load_b32 v2, off, s33 offset:12 +; GISEL64-NEXT: scratch_load_b32 v3, off, s33 offset:16 +; GISEL64-NEXT: scratch_load_b32 v4, off, s33 offset:20 +; GISEL64-NEXT: scratch_load_b32 v5, off, s33 offset:24 +; GISEL64-NEXT: scratch_load_b32 v6, off, s33 offset:28 +; GISEL64-NEXT: scratch_load_b32 v7, off, s33 offset:32 +; GISEL64-NEXT: scratch_load_b32 v8, off, s33 offset:36 +; GISEL64-NEXT: scratch_load_b32 v9, off, s33 offset:40 +; GISEL64-NEXT: scratch_load_b32 v10, off, s33 offset:44 +; GISEL64-NEXT: scratch_load_b32 v11, off, s33 offset:48 +; GISEL64-NEXT: scratch_load_b32 v12, off, s33 offset:52 +; GISEL64-NEXT: scratch_load_b32 v13, off, s33 offset:56 +; GISEL64-NEXT: scratch_load_b32 v14, off, s33 offset:60 +; GISEL64-NEXT: scratch_load_b32 v15, off, s33 offset:64 +; GISEL64-NEXT: scratch_load_b32 v16, off, s33 offset:68 +; GISEL64-NEXT: scratch_load_b32 v17, off, s33 offset:72 +; GISEL64-NEXT: scratch_load_b32 v18, off, s33 offset:76 +; GISEL64-NEXT: scratch_load_b32 v19, off, s33 offset:80 +; GISEL64-NEXT: scratch_load_b32 v20, off, s33 offset:84 +; 
GISEL64-NEXT: scratch_load_b32 v21, off, s33 offset:88 +; GISEL64-NEXT: scratch_load_b32 v22, off, s33 offset:92 +; GISEL64-NEXT: scratch_load_b32 v23, off, s33 offset:96 +; GISEL64-NEXT: scratch_load_b32 v24, off, s33 offset:100 +; GISEL64-NEXT: scratch_load_b32 v25, off, s33 offset:104 +; GISEL64-NEXT: scratch_load_b32 v26, off, s33 offset:108 +; GISEL64-NEXT: scratch_load_b32 v27, off, s33 offset:112 +; GISEL64-NEXT: scratch_load_b32 v28, off, s33 offset:116 +; GISEL64-NEXT: scratch_load_b32 v29, off, s33 offset:120 +; GISEL64-NEXT: scratch_load_b32 v30, off, s33 offset:124 +; GISEL64-NEXT: scratch_load_b32 v31, off, s33 offset:128 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v32, off, s33 offset:132 +; GISEL64-NEXT: scratch_load_b32 v33, off, s33 offset:136 +; GISEL64-NEXT: scratch_load_b32 v34, off, s33 offset:140 +; GISEL64-NEXT: scratch_load_b32 v35, off, s33 offset:144 +; GISEL64-NEXT: scratch_load_b32 v36, off, s33 offset:148 +; GISEL64-NEXT: scratch_load_b32 v37, off, s33 offset:152 +; GISEL64-NEXT: scratch_load_b32 v38, off, s33 offset:156 +; GISEL64-NEXT: scratch_load_b32 v39, off, s33 offset:160 +; GISEL64-NEXT: scratch_load_b32 v48, off, s33 offset:164 +; GISEL64-NEXT: scratch_load_b32 v49, off, s33 offset:168 +; GISEL64-NEXT: scratch_load_b32 v50, off, s33 offset:172 +; GISEL64-NEXT: scratch_load_b32 v51, off, s33 offset:176 +; GISEL64-NEXT: scratch_load_b32 v52, off, s33 offset:180 +; GISEL64-NEXT: scratch_load_b32 v53, off, s33 offset:184 +; GISEL64-NEXT: scratch_load_b32 v54, off, s33 offset:188 +; GISEL64-NEXT: scratch_load_b32 v55, off, s33 offset:192 +; GISEL64-NEXT: scratch_load_b32 v64, off, s33 offset:196 +; GISEL64-NEXT: scratch_load_b32 v65, off, s33 offset:200 +; GISEL64-NEXT: scratch_load_b32 v66, off, s33 offset:204 +; GISEL64-NEXT: scratch_load_b32 v67, off, s33 offset:208 +; GISEL64-NEXT: scratch_load_b32 v68, off, s33 offset:212 +; GISEL64-NEXT: scratch_load_b32 v69, off, s33 offset:216 +; GISEL64-NEXT: 
scratch_load_b32 v70, off, s33 offset:220 +; GISEL64-NEXT: scratch_load_b32 v71, off, s33 offset:224 +; GISEL64-NEXT: scratch_load_b32 v80, off, s33 offset:228 +; GISEL64-NEXT: scratch_load_b32 v81, off, s33 offset:232 +; GISEL64-NEXT: scratch_load_b32 v82, off, s33 offset:236 +; GISEL64-NEXT: scratch_load_b32 v83, off, s33 offset:240 +; GISEL64-NEXT: scratch_load_b32 v84, off, s33 offset:244 +; GISEL64-NEXT: scratch_load_b32 v85, off, s33 offset:248 +; GISEL64-NEXT: scratch_load_b32 v86, off, s33 offset:252 +; GISEL64-NEXT: scratch_load_b32 v87, off, s33 offset:256 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v96, off, s33 offset:260 +; GISEL64-NEXT: scratch_load_b32 v97, off, s33 offset:264 +; GISEL64-NEXT: scratch_load_b32 v98, off, s33 offset:268 +; GISEL64-NEXT: scratch_load_b32 v99, off, s33 offset:272 +; GISEL64-NEXT: scratch_load_b32 v100, off, s33 offset:276 +; GISEL64-NEXT: scratch_load_b32 v101, off, s33 offset:280 +; GISEL64-NEXT: scratch_load_b32 v102, off, s33 offset:284 +; GISEL64-NEXT: scratch_load_b32 v103, off, s33 offset:288 +; GISEL64-NEXT: scratch_load_b32 v112, off, s33 offset:292 +; GISEL64-NEXT: scratch_load_b32 v113, off, s33 offset:296 +; GISEL64-NEXT: scratch_load_b32 v114, off, s33 offset:300 +; GISEL64-NEXT: scratch_load_b32 v115, off, s33 offset:304 +; GISEL64-NEXT: scratch_load_b32 v116, off, s33 offset:308 +; GISEL64-NEXT: scratch_load_b32 v117, off, s33 offset:312 +; GISEL64-NEXT: scratch_load_b32 v118, off, s33 offset:316 +; GISEL64-NEXT: scratch_load_b32 v119, off, s33 offset:320 +; GISEL64-NEXT: scratch_load_b32 v128, off, s33 offset:324 +; GISEL64-NEXT: scratch_load_b32 v129, off, s33 offset:328 +; GISEL64-NEXT: scratch_load_b32 v130, off, s33 offset:332 +; GISEL64-NEXT: scratch_load_b32 v131, off, s33 offset:336 +; GISEL64-NEXT: scratch_load_b32 v132, off, s33 offset:340 +; GISEL64-NEXT: scratch_load_b32 v133, off, s33 offset:344 +; GISEL64-NEXT: scratch_load_b32 v134, off, s33 offset:348 +; GISEL64-NEXT: 
scratch_load_b32 v135, off, s33 offset:352 +; GISEL64-NEXT: scratch_load_b32 v144, off, s33 offset:356 +; GISEL64-NEXT: scratch_load_b32 v145, off, s33 offset:360 +; GISEL64-NEXT: scratch_load_b32 v146, off, s33 offset:364 +; GISEL64-NEXT: scratch_load_b32 v147, off, s33 offset:368 +; GISEL64-NEXT: scratch_load_b32 v148, off, s33 offset:372 +; GISEL64-NEXT: scratch_load_b32 v149, off, s33 offset:376 +; GISEL64-NEXT: scratch_load_b32 v150, off, s33 offset:380 +; GISEL64-NEXT: scratch_load_b32 v151, off, s33 offset:384 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v160, off, s33 offset:388 +; GISEL64-NEXT: scratch_load_b32 v161, off, s33 offset:392 +; GISEL64-NEXT: scratch_load_b32 v162, off, s33 offset:396 +; GISEL64-NEXT: scratch_load_b32 v163, off, s33 offset:400 +; GISEL64-NEXT: scratch_load_b32 v164, off, s33 offset:404 +; GISEL64-NEXT: scratch_load_b32 v165, off, s33 offset:408 +; GISEL64-NEXT: scratch_load_b32 v166, off, s33 offset:412 +; GISEL64-NEXT: scratch_load_b32 v167, off, s33 offset:416 +; GISEL64-NEXT: scratch_load_b32 v176, off, s33 offset:420 +; GISEL64-NEXT: scratch_load_b32 v177, off, s33 offset:424 +; GISEL64-NEXT: scratch_load_b32 v178, off, s33 offset:428 +; GISEL64-NEXT: scratch_load_b32 v179, off, s33 offset:432 +; GISEL64-NEXT: scratch_load_b32 v180, off, s33 offset:436 +; GISEL64-NEXT: scratch_load_b32 v181, off, s33 offset:440 +; GISEL64-NEXT: scratch_load_b32 v182, off, s33 offset:444 +; GISEL64-NEXT: scratch_load_b32 v183, off, s33 offset:448 +; GISEL64-NEXT: scratch_load_b32 v192, off, s33 offset:452 +; GISEL64-NEXT: scratch_load_b32 v193, off, s33 offset:456 +; GISEL64-NEXT: scratch_load_b32 v194, off, s33 offset:460 +; GISEL64-NEXT: scratch_load_b32 v195, off, s33 offset:464 +; GISEL64-NEXT: scratch_load_b32 v196, off, s33 offset:468 +; GISEL64-NEXT: scratch_load_b32 v197, off, s33 offset:472 +; GISEL64-NEXT: scratch_load_b32 v198, off, s33 offset:476 +; GISEL64-NEXT: scratch_load_b32 v199, off, s33 offset:480 +; 
GISEL64-NEXT: scratch_load_b32 v208, off, s33 offset:484 +; GISEL64-NEXT: scratch_load_b32 v209, off, s33 offset:488 +; GISEL64-NEXT: scratch_load_b32 v210, off, s33 offset:492 +; GISEL64-NEXT: scratch_load_b32 v211, off, s33 offset:496 +; GISEL64-NEXT: scratch_load_b32 v212, off, s33 offset:500 +; GISEL64-NEXT: scratch_load_b32 v213, off, s33 offset:504 +; GISEL64-NEXT: scratch_load_b32 v214, off, s33 offset:508 +; GISEL64-NEXT: scratch_load_b32 v215, off, s33 offset:512 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_load_b32 v224, off, s33 offset:516 +; GISEL64-NEXT: scratch_load_b32 v225, off, s33 offset:520 +; GISEL64-NEXT: scratch_load_b32 v226, off, s33 offset:524 +; GISEL64-NEXT: scratch_load_b32 v227, off, s33 offset:528 +; GISEL64-NEXT: scratch_load_b32 v228, off, s33 offset:532 +; GISEL64-NEXT: scratch_load_b32 v229, off, s33 offset:536 +; GISEL64-NEXT: scratch_load_b32 v230, off, s33 offset:540 +; GISEL64-NEXT: scratch_load_b32 v231, off, s33 offset:544 +; GISEL64-NEXT: scratch_load_b32 v240, off, s33 offset:548 +; GISEL64-NEXT: scratch_load_b32 v241, off, s33 offset:552 +; GISEL64-NEXT: scratch_load_b32 v242, off, s33 offset:556 +; GISEL64-NEXT: scratch_load_b32 v243, off, s33 offset:560 +; GISEL64-NEXT: scratch_load_b32 v244, off, s33 offset:564 +; GISEL64-NEXT: scratch_load_b32 v245, off, s33 offset:568 +; GISEL64-NEXT: scratch_load_b32 v246, off, s33 offset:572 +; GISEL64-NEXT: scratch_load_b32 v247, off, s33 offset:576 +; GISEL64-NEXT: s_mov_b64 exec, s[4:5] +; GISEL64-NEXT: s_mov_b32 s33, s0 +; GISEL64-NEXT: s_wait_loadcnt 0x0 +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_setpc_b64 s[30:31] + %ret = call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent + ret <2 x half> %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir new file mode 100644 index 0000000..2f7a6e2 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/wmma-coececution-valu-hazards.mir @@ -0,0 +1,902 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s + +# WMMA writes: D0, WMMA reads: A0/B0/Index0 +# VALU writes: D1, VALU reads: Use1 +# Hazards could be: +# RAW: D0 overlaps Use1 +# WAW: D0 overlaps D1 +# WAR: A0/B0/Index0 overlaps D1 + +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +... 
+# RAW case (D0 overlaps Use1) with four V_MOV_B32 between the WMMA and the V_ADD that reads $vgpr16: the checks expect no V_NOP to be inserted. +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec + ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 26, implicit $exec + $vgpr27 = V_MOV_B32_e32 27, implicit $exec + $vgpr28 = V_MOV_B32_e32 28, implicit $exec + $vgpr29 = V_MOV_B32_e32 29, implicit $exec + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +... 
+# RAW case (D0 overlaps Use1) with four S_MOV_B32 between the WMMA and the V_ADD that reads $vgpr16: the checks still expect four V_NOPs before the V_ADD. +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr25 = V_ADD_F32_e32 $vgpr24, $vgpr16, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr24, $vgpr25, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr26 = V_MOV_B32_e32 26, implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_MOV_B32_e32 27, implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_MOV_B32_e32 28, implicit $exec + ; GFX1250-NEXT: $vgpr29 = V_MOV_B32_e32 29, implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr26 = V_MOV_B32_e32 26, implicit $exec + $vgpr27 = V_MOV_B32_e32 27, implicit $exec + $vgpr28 = V_MOV_B32_e32 28, implicit $exec + $vgpr29 = V_MOV_B32_e32 29, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr22, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr27 = V_ADD_F32_e32 $vgpr22, $vgpr26, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr22 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr26, $vgpr27, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_NoF8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_valus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec + ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec + ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec + ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr44 = V_MOV_B32_e32 44, implicit $exec + $vgpr45 = V_MOV_B32_e32 45, implicit $exec + $vgpr46 = V_MOV_B32_e32 46, implicit $exec + $vgpr47 = V_MOV_B32_e32 47, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Use1_with_8_salus_in_between + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4 + ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5 + ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6 + ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $sgpr4 = S_MOV_B32 4 + $sgpr5 = S_MOV_B32 5 + $sgpr6 = S_MOV_B32 6 + $sgpr7 = S_MOV_B32 7 + $vgpr32 = V_ADD_F32_e32 $vgpr16, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr32, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_valus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr44 = V_MOV_B32_e32 44, implicit $exec + ; GFX1250-NEXT: $vgpr45 = V_MOV_B32_e32 45, implicit $exec + ; GFX1250-NEXT: $vgpr46 = V_MOV_B32_e32 46, implicit $exec + ; GFX1250-NEXT: $vgpr47 = V_MOV_B32_e32 47, implicit $exec + ; GFX1250-NEXT: $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 2, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr44 = V_MOV_B32_e32 44, implicit $exec + $vgpr45 = V_MOV_B32_e32 45, implicit $exec + $vgpr46 = V_MOV_B32_e32 46, implicit $exec + 
$vgpr47 = V_MOV_B32_e32 47, implicit $exec + $vgpr49 = V_ADD_F32_e32 $vgpr32, $vgpr48, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Use1_with_8_salus_in_between + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 3 + ; GFX1250-NEXT: $sgpr4 = S_MOV_B32 4 + ; GFX1250-NEXT: $sgpr5 = S_MOV_B32 5 + ; GFX1250-NEXT: $sgpr6 = S_MOV_B32 6 + ; GFX1250-NEXT: $sgpr7 = S_MOV_B32 7 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + 
$sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 3 + $sgpr4 = S_MOV_B32 4 + $sgpr5 = S_MOV_B32 5 + $sgpr6 = S_MOV_B32 6 + $sgpr7 = S_MOV_B32 7 + $vgpr48 = V_ADD_F32_e32 $vgpr32, $vgpr47, implicit $mode, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_ADD_F32_e32 $vgpr47, $vgpr48, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Use1_with_2_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $vgpr34 = V_ADD_F32_e32 $vgpr24, $vgpr33, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32, 0, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr33, $vgpr34, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Use1_with_2_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $vgpr31 = V_ADD_F32_e32 $vgpr24, $vgpr30, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr28 = V_ADD_F32_e32 $vgpr30, $vgpr31, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr33, $vgpr24, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_valus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $vgpr40 = V_MOV_B32_e32 40, implicit $exec + ; GFX1250-NEXT: $vgpr41 = V_MOV_B32_e32 41, implicit $exec + ; GFX1250-NEXT: $vgpr42 = V_MOV_B32_e32 42, implicit $exec + ; GFX1250-NEXT: $vgpr43 = V_MOV_B32_e32 43, implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr40 = V_MOV_B32_e32 40, implicit $exec + $vgpr41 = V_MOV_B32_e32 41, implicit $exec + $vgpr42 = V_MOV_B32_e32 42, implicit $exec + $vgpr43 = V_MOV_B32_e32 43, implicit $exec + $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Use1_with_4_salus_in_between + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: $sgpr0 = S_MOV_B32 0 + ; GFX1250-NEXT: $sgpr1 = S_MOV_B32 1 + ; GFX1250-NEXT: $sgpr2 = S_MOV_B32 2 + ; GFX1250-NEXT: $sgpr3 = S_MOV_B32 4 + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 1 + $sgpr2 = S_MOV_B32 2 + $sgpr3 = S_MOV_B32 4 + $vgpr34 = V_ADD_F32_e32 $vgpr34, $vgpr24, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_A0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_B0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr8 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_Index0_overlaps_D1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir new file mode 100644 index 0000000..2032b98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx1250-w32.mir @@ -0,0 +1,1430 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX1250 %s + +# For two consecutive wmma instructions, we need to insert one V_NOP instruction between +# them if matrix A, B or index of the second wmma are the same or overlap with the previous +# wmma instruction’s D-matrix. 
+ +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr4_vgpr5, 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr14_vgpr15, 8, killed $vgpr4_vgpr5, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x4_f32_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr2_vgpr3, 8, killed $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr4_vgpr5, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr34_vgpr35_vgpr36_vgpr37 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 9, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_bf16f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec + $vgpr26_vgpr27_vgpr28_vgpr29 = V_WMMA_BF16F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr26_vgpr27, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x64_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_fb8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x64_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X64_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_I32_16x16x64_IU8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_I32_16X16X64_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 
0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f32_16x16x32_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec + $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr16_vgpr17, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec + $vgpr30_vgpr31_vgpr32_vgpr33 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, 0, 0, 0, 0, implicit $exec + $vgpr38_vgpr39_vgpr40_vgpr41 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37, 8, killed $vgpr38_vgpr39_vgpr40_vgpr41, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_f16_16x16x32_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec + $vgpr22_vgpr23_vgpr24_vgpr25 = V_WMMA_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr22_vgpr23_vgpr24_vgpr25, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr22_vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = 
V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + 
$vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, 1, 2, 0, 0, implicit $exec +... + +--- +name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_wmma_F32_16x16x128_F8F6F4_F6F4_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 2, 2, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr32_vgpr33, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + 
$vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed 
$vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16_16x16x64_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35 = V_SWMMAC_BF16_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_bf16f32_16x16x64_bf16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_BF16F32_16X16X64_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x128_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_fp8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_fp8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51, killed $vgpr90_vgpr91, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x128_bf8_bf8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88_vgpr89, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_BF8_BF8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 
0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr90_vgpr91, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_i32_16x16x128_iu8_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88_vgpr89, 0, 0, 0, 0, implicit $exec + $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 = V_SWMMAC_F32_16X16X128_FP8_FP8_w32_twoaddr killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed 
$vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55, killed $vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, killed $vgpr24_vgpr25, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f32_16x16x64_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71 = V_SWMMAC_F32_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_A1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_B1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1 +body: | + bb.0: + ; GFX1250-LABEL: name: test_swmmac_f16_16x16x64_f16_D0_overlaps_Index1 + ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: early-clobber $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27, killed $vgpr88, 0, 0, 0, 0, 0, implicit $exec + $vgpr64_vgpr65_vgpr66_vgpr67 = V_SWMMAC_F16_16X16X64_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr64_vgpr65_vgpr66_vgpr67, killed $vgpr24, 0, 0, 0, 0, 0, implicit $exec +... |