diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll | 1238 |
1 files changed, 722 insertions, 516 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll index 92af34f..beda16c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll @@ -1,13 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32) @@ -35,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3 declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32) define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { -; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX942-SDAG: ; %bb.0: ; %bb -; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-NEXT: s_endpgm +; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -77,26 +73,47 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8: -; GFX950-SDAG: ; %bb.0: ; %bb -; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX950-SDAG-NEXT: s_endpgm +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm +; +; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -118,6 +135,27 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 { ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm +; +; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8: +; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX950-AGPRCD-SDAG-NEXT: s_endpgm bb: %in.1 = load <4 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3) @@ -281,26 +319,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX942-SDAG: ; %bb.0: ; %bb -; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-NEXT: s_endpgm +; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -323,26 +361,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: -; GFX950-SDAG: ; %bb.0: ; %bb -; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX950-SDAG-NEXT: s_endpgm +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm +; +; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -364,6 +423,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm +; +; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8: +; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX950-AGPRCD-SDAG-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -372,26 +452,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX942-SDAG: ; %bb.0: ; %bb -; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-NEXT: s_endpgm +; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -414,26 +494,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: -; GFX950-SDAG: ; %bb.0: ; %bb -; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX950-SDAG-NEXT: s_endpgm +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm +; +; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -455,6 +556,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm +; +; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8: +; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX950-AGPRCD-SDAG-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -463,26 +585,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 { -; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX942-SDAG: ; %bb.0: ; %bb -; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-NEXT: s_endpgm +; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -505,26 +627,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: -; GFX950-SDAG: ; %bb.0: ; %bb -; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX950-SDAG-NEXT: s_endpgm +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm +; +; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -546,6 +689,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm +; +; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8: +; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX950-AGPRCD-SDAG-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -554,26 +718,26 @@ bb: } define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 { -; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX942-SDAG: ; %bb.0: ; %bb -; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX942-SDAG-NEXT: s_nop 1 -; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX942-SDAG-NEXT: s_nop 6 -; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX942-SDAG-NEXT: s_endpgm +; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX942-GISEL: ; %bb.0: ; %bb @@ -596,26 +760,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX942-GISEL-NEXT: s_endpgm ; -; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: -; GFX950-SDAG: ; %bb.0: ; %bb -; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0 -; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; GFX950-SDAG-NEXT: s_nop 1 -; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 -; GFX950-SDAG-NEXT: s_nop 7 -; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] -; GFX950-SDAG-NEXT: s_endpgm +; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX942-AGPRCD-SDAG-NEXT: s_nop 6 +; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX942-AGPRCD-SDAG-NEXT: s_endpgm +; +; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8: ; GFX950-GISEL: ; %bb.0: ; %bb @@ -637,6 +822,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) ; GFX950-GISEL-NEXT: s_nop 6 ; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GFX950-GISEL-NEXT: s_endpgm +; +; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8: +; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2 +; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 1 +; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 +; GFX950-AGPRCD-SDAG-NEXT: s_nop 7 +; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7] +; GFX950-AGPRCD-SDAG-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3) @@ -1269,20 +1475,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1291,18 +1497,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1332,20 +1538,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1354,18 +1560,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, < ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16: @@ -1681,20 +1887,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1703,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1744,20 +1950,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -1766,18 +1972,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg, ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44 -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16: @@ -2093,23 +2299,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2119,21 +2325,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2197,23 +2403,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2223,21 +2429,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2 ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8: @@ -2309,16 +2515,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2327,7 +2533,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 @@ -2461,16 +2667,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -2479,7 +2685,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 @@ -2619,23 +2825,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2645,21 +2851,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2723,23 +2929,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2749,21 +2955,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8: @@ -2834,23 +3040,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -2860,21 +3066,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -2938,23 +3144,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -2964,21 +3170,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8: @@ -3049,23 +3255,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3075,21 +3281,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3153,23 +3359,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3179,21 +3385,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8: @@ -3264,23 +3470,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 6 -; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX942-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3290,21 +3496,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-VGPRCD-GISEL-NEXT: s_nop 5 -; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX942-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3368,23 +3574,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3] +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 -; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7] +; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7] ; GFX950-VGPRCD-SDAG-NEXT: s_endpgm ; ; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3394,21 +3600,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c ; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44 ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1] ; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2 ; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5] ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7] ; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11] -; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14 +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] +; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 1 -; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2 +; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2 ; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-VGPRCD-GISEL-NEXT: s_nop 6 -; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13] +; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13] ; GFX950-VGPRCD-GISEL-NEXT: s_endpgm ; ; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8: @@ -3480,16 +3686,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3498,7 +3704,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 @@ -3632,16 +3838,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3650,7 +3856,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 @@ -3791,16 +3997,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3809,7 +4015,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 @@ -3943,16 +4149,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -3961,7 +4167,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 @@ -4102,16 +4308,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4120,7 +4326,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 @@ -4254,16 +4460,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4272,7 +4478,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 @@ -4413,16 +4619,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4431,7 +4637,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX942-VGPRCD-SDAG-NEXT: s_nop 1 @@ -4565,16 +4771,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16 ; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20 ; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21 -; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21 +; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7] @@ -4583,7 +4789,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX950-VGPRCD-SDAG-NEXT: s_nop 1 -; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2 +; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2 ; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 7 ; GFX950-VGPRCD-SDAG-NEXT: s_nop 2 |