aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll1238
1 files changed, 722 insertions, 516 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index 92af34f..beda16c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -1,13 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-SDAG,GFX942-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942,GFX942-VGPRCD,GFX942-GISEL,GFX942-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-SDAG,GFX942-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX942,GFX942-AGPRCD,GFX942-GISEL,GFX942-AGPRCD-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-SDAG,GFX950-VGPRCD-SDAG %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck --check-prefixes=GFX950,GFX950-VGPRCD,GFX950-GISEL,GFX950-VGPRCD-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-SDAG,GFX950-AGPRCD-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GFX950,GFX950-AGPRCD,GFX950-GISEL,GFX950-AGPRCD-GISEL %s
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64, i64, <4 x i32>, i32, i32, i32)
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64, i64, <16 x i32>, i32, i32, i32)
@@ -35,26 +31,26 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32>, <4 x i3
declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32>, <4 x i32>, <16 x float>, i32, i32, i32)
define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -77,26 +73,47 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -118,6 +135,27 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 4294967298, i64 12884901892, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -281,26 +319,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -323,26 +361,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -364,6 +423,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -372,26 +452,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -414,26 +494,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -455,6 +556,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -463,26 +585,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -505,26 +627,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -546,6 +689,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -554,26 +718,26 @@ bb:
}
define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg) #0 {
-; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX942-SDAG: ; %bb.0: ; %bb
-; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX942-SDAG-NEXT: s_endpgm
+; GFX942-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
@@ -596,26 +760,47 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
-; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
-; GFX950-SDAG: ; %bb.0: ; %bb
-; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
-; GFX950-SDAG-NEXT: s_endpgm
+; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX942-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX942-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX942-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-AGPRCD-SDAG-NEXT: s_nop 6
+; GFX942-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX942-AGPRCD-SDAG-NEXT: s_endpgm
+;
+; GFX950-VGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 2
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 4
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-VGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-VGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
@@ -637,6 +822,27 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX950-GISEL-NEXT: s_nop 6
; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
+;
+; GFX950-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
+; GFX950-AGPRCD-SDAG: ; %bb.0: ; %bb
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 2
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, 4
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-AGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX950-AGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
+; GFX950-AGPRCD-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 1
+; GFX950-AGPRCD-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-AGPRCD-SDAG-NEXT: s_nop 7
+; GFX950-AGPRCD-SDAG-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
+; GFX950-AGPRCD-SDAG-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 4294967298, i64 12884901892, <4 x float> %in.1, i32 1, i32 2, i32 3)
@@ -1269,20 +1475,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1291,18 +1497,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1332,20 +1538,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1354,18 +1560,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_f16(ptr addrspace(1) %arg, <
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_f16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_f16:
@@ -1681,20 +1887,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1703,18 +1909,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1744,20 +1950,20 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s6
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v7, s6
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v7 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[8:9]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v6, v[8:11], s[8:9]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -1766,18 +1972,18 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x32_bf16(ptr addrspace(1) %arg,
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s6, s[4:5], 0x44
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[10:11]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s6
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s6
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x32_bf16 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-LABEL: test_smfmac_f32_16x16x32_bf16:
@@ -2093,23 +2299,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2119,21 +2325,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2197,23 +2403,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_16x16x64_i8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2223,21 +2429,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_i32_16x16x64_i8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_i8:
@@ -2309,16 +2515,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2327,7 +2533,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -2461,16 +2667,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -2479,7 +2685,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_i8(ptr addrspace(1) %arg, <2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_i32_32x32x32_i8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -2619,23 +2825,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2645,21 +2851,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2723,23 +2929,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2749,21 +2955,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_bf8:
@@ -2834,23 +3040,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2860,21 +3066,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2938,23 +3144,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -2964,21 +3170,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_bf8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_bf8_fp8:
@@ -3049,23 +3255,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3075,21 +3281,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3153,23 +3359,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3179,21 +3385,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_bf8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_bf8:
@@ -3264,23 +3470,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: s_nop 6
-; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX942-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX942-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX942-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3290,21 +3496,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX942-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX942-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX942-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX942-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX942-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX942-VGPRCD-GISEL-NEXT: s_nop 5
-; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX942-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX942-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX942-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3368,23 +3574,23 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG: ; %bb.0: ; %bb
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, 0
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v8, s8
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v10, s8
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v9, s9
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v0, s10
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s11
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s12
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v2, s10
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s11
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v4, s12
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v3, s13
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v11, s14
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v5, s13
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v1, s14
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v11 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[6:9], v[10:11], v[2:5], v1 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
-; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
+; GFX950-VGPRCD-SDAG-NEXT: global_store_dwordx4 v0, v[6:9], s[6:7]
; GFX950-VGPRCD-SDAG-NEXT: s_endpgm
;
; GFX950-VGPRCD-GISEL-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3394,21 +3600,21 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x64_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
; GFX950-VGPRCD-GISEL-NEXT: s_load_dword s14, s[4:5], 0x44
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX950-VGPRCD-GISEL-NEXT: s_load_dwordx4 s[8:11], s[12:13], 0x0
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s4, s2
; GFX950-VGPRCD-GISEL-NEXT: s_mov_b32 s5, s3
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX950-VGPRCD-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
-; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v10, s14
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v6, s14
; GFX950-VGPRCD-GISEL-NEXT: s_nop 1
-; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[4:7], v[8:9], v[0:3], v10 cbsz:1 abid:2
+; GFX950-VGPRCD-GISEL-NEXT: v_smfmac_f32_16x16x64_fp8_fp8 v[8:11], v[4:5], v[0:3], v6 cbsz:1 abid:2
; GFX950-VGPRCD-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX950-VGPRCD-GISEL-NEXT: s_nop 6
-; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[4:7], s[12:13]
+; GFX950-VGPRCD-GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[12:13]
; GFX950-VGPRCD-GISEL-NEXT: s_endpgm
;
; GFX950-AGPRCD-SDAG-LABEL: test_smfmac_i32_16x16x64_fp8_fp8:
@@ -3480,16 +3686,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3498,7 +3704,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3632,16 +3838,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3650,7 +3856,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -3791,16 +3997,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3809,7 +4015,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -3943,16 +4149,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -3961,7 +4167,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_bf8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_bf8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4102,16 +4308,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4120,7 +4326,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4254,16 +4460,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4272,7 +4478,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_bf8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_bf8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2
@@ -4413,16 +4619,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX942-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX942-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4431,7 +4637,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX942-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX942-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPRCD-SDAG-NEXT: s_nop 7
; GFX942-VGPRCD-SDAG-NEXT: s_nop 1
@@ -4565,16 +4771,16 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x2c
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx2 s[24:25], s[4:5], 0x24
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s16
; GFX950-VGPRCD-SDAG-NEXT: s_load_dwordx16 s[0:15], s[24:25], 0x0
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s17
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s18
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v17, s19
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s20
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v18, s18
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s19
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v20, s20
; GFX950-VGPRCD-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v19, s21
-; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v22, s22
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v21, s21
+; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, s22
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
@@ -4583,7 +4789,7 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x32_fp8_fp8(ptr addrspace(1) %ar
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-VGPRCD-SDAG-NEXT: s_nop 1
-; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[20:21], v[16:19], v22 cbsz:1 abid:2
+; GFX950-VGPRCD-SDAG-NEXT: v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[22:23], v[18:21], v16 cbsz:1 abid:2
; GFX950-VGPRCD-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-VGPRCD-SDAG-NEXT: s_nop 7
; GFX950-VGPRCD-SDAG-NEXT: s_nop 2