aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2025-07-25 19:03:52 +0900
committerMatt Arsenault <arsenm2@gmail.com>2025-09-18 11:06:21 +0900
commit9be83ee93251a08500b826cb87555692b8886dd7 (patch)
tree198e39686db16bd959c8a4cb61c8b7b95a1cdfd3
parent8c778b6dbe34f5db5f28730653d81aabc18430fd (diff)
downloadllvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.zip
llvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.tar.gz
llvm-users/arsenm/amdgpu/select-vgpr-mfma-by-default.tar.bz2
AMDGPU: Select VGPR MFMAs by default
users/arsenm/amdgpu/select-vgpr-mfma-by-default
AGPRs are undesirable since they are only usable by a handful of instructions, such as loads, stores, and MFMAs; everything else requires copies to/from VGPRs. Using the AGPR form should be a measure of last resort, for when we must use more than 256 VGPRs.
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp2
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll300
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/acc-ldst.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll248
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll152
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll962
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll1280
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll731
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll886
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll2121
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll1218
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll6719
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll1105
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll2933
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-loop.ll2106
-rw-r--r--llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll987
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/spill-agpr.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll104
28 files changed, 11169 insertions, 10914 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 908d856..0077c69 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -37,7 +37,7 @@ static cl::opt<bool> MFMAVGPRForm(
"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
- cl::init(false));
+ cl::init(true));
const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 5720b88..2493065 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -15,59 +15,42 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[36:37], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
-; GCN-NEXT: s_mov_b32 s38, 2
-; GCN-NEXT: s_mov_b32 s39, s37
+; GCN-NEXT: v_pk_mov_b32 v[32:33], s[36:37], s[36:37] op_sel:[0,1]
+; GCN-NEXT: s_mov_b32 s36, 2
+; GCN-NEXT: v_pk_mov_b32 v[34:35], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a16, s16
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
-; GCN-NEXT: v_accvgpr_write_b32 a17, s17
-; GCN-NEXT: v_accvgpr_write_b32 a18, s18
-; GCN-NEXT: v_accvgpr_write_b32 a19, s19
-; GCN-NEXT: v_accvgpr_write_b32 a20, s20
-; GCN-NEXT: v_accvgpr_write_b32 a21, s21
-; GCN-NEXT: v_accvgpr_write_b32 a22, s22
-; GCN-NEXT: v_accvgpr_write_b32 a23, s23
-; GCN-NEXT: v_accvgpr_write_b32 a24, s24
-; GCN-NEXT: v_accvgpr_write_b32 a25, s25
-; GCN-NEXT: v_accvgpr_write_b32 a26, s26
-; GCN-NEXT: v_accvgpr_write_b32 a27, s27
-; GCN-NEXT: v_accvgpr_write_b32 a28, s28
-; GCN-NEXT: v_accvgpr_write_b32 a29, s29
-; GCN-NEXT: v_accvgpr_write_b32 a30, s30
-; GCN-NEXT: v_accvgpr_write_b32 a31, s31
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[0:1], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[32:33], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GCN-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GCN-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GCN-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GCN-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -83,36 +66,28 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[18:19], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s18, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 9
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -128,21 +103,19 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[4:5], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s4, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_nop 3
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -158,37 +131,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[18:19], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s18, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[18:19], s[18:19] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_accvgpr_write_b32 a8, s8
-; GCN-NEXT: v_accvgpr_write_b32 a9, s9
-; GCN-NEXT: v_accvgpr_write_b32 a10, s10
-; GCN-NEXT: v_accvgpr_write_b32 a11, s11
-; GCN-NEXT: v_accvgpr_write_b32 a12, s12
-; GCN-NEXT: v_accvgpr_write_b32 a13, s13
-; GCN-NEXT: v_accvgpr_write_b32 a14, s14
-; GCN-NEXT: v_accvgpr_write_b32 a15, s15
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v16, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 1
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -204,21 +169,19 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[4:5], 1
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s4, 2
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[4:5], s[4:5] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_nop 9
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -238,12 +201,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GCN-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call double @llvm.amdgcn.mfma.f64.4x4x4f64(double %a, double %b, double 0.0, i32 0, i32 0, i32 0)
@@ -258,25 +221,21 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s4
-; GCN-NEXT: v_accvgpr_write_b32 a5, s5
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GCN-NEXT: s_endpgm
bb:
%in.1 = load <4 x double>, ptr addrspace(1) %arg
@@ -291,16 +250,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 0.0>, i32 0, i32 0, i32 0)
@@ -312,28 +271,26 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_mov_b64 s[6:7], 1.0
-; GCN-NEXT: s_mov_b64 s[8:9], 0
-; GCN-NEXT: v_accvgpr_write_b32 a0, s8
+; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GCN-NEXT: v_accvgpr_write_b32 a2, s8
-; GCN-NEXT: v_accvgpr_write_b32 a4, s8
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a1, s9
-; GCN-NEXT: v_accvgpr_write_b32 a3, s9
-; GCN-NEXT: v_accvgpr_write_b32 a5, s9
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -344,28 +301,27 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
-; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: s_mov_b32 s7, 0x405ec000
-; GCN-NEXT: v_accvgpr_write_b32 a0, s6
+; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
+; GCN-NEXT: s_mov_b32 s0, 0
+; GCN-NEXT: s_mov_b32 s1, 0x405ec000
+; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GCN-NEXT: v_accvgpr_write_b32 a2, s6
-; GCN-NEXT: v_accvgpr_write_b32 a4, s6
-; GCN-NEXT: v_accvgpr_write_b32 a6, s6
-; GCN-NEXT: v_accvgpr_write_b32 a1, s7
-; GCN-NEXT: v_accvgpr_write_b32 a3, s7
-; GCN-NEXT: v_accvgpr_write_b32 a5, s7
-; GCN-NEXT: v_accvgpr_write_b32 a7, s7
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
+; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GCN-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GCN-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 0
-; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
+; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
index 67ed51a3..5f040779 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx90a.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
-# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
---
name: mfma_f32_32x32x4bf16_1k_vva
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir
index e11586e..12208c1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.gfx942.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-fast -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=FAST
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -run-pass=regbankselect -regbankselect-greedy -amdgpu-mfma-vgpr-form=0 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GREEDY
---
name: mfma_i32_16x16x32_i8_vva
diff --git a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
index 635d2a2..4258d1d 100644
--- a/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
+++ b/llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -early-live-intervals < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x i32>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 9e24023..f2f41f4 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-mfma-vgpr-form=0 -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; This testcase would fail on GFX908 due to not having a free VGPR available to
; copy between AGPRs.
diff --git a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
index fe8edd5..7644d89 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx90a-enc.ll
@@ -1,8 +1,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
-; Make sure flag is ignored
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=1 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
+; Make sure flag is ignored for gfx908
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX9,GFX90A %s
; GFX9-DAG: buffer_load_format_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
; GFX9-DAG: buffer_load_format_d16_xyzw v[{{[0-9:]+}}], v{{[0-9]+}}, s[{{[0-9:]+}}], 0 idxen ; encoding:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
index 80f295b..51bcb39 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.simple.ll
@@ -6,146 +6,144 @@ define amdgpu_kernel void @MFMAExpInterleave(ptr addrspace(1) %out0, ptr addrspa
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s6, s[4:5], 0x10
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
-; GCN-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
-; GCN-NEXT: v_mov_b32_e32 v0, 1.0
+; GCN-NEXT: v_mov_b32_e32 v5, 0x3fb8aa3b
+; GCN-NEXT: v_mov_b32_e32 v4, 1.0
; GCN-NEXT: s_mov_b32 s7, 0x42b17218
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, s6, v1
-; GCN-NEXT: v_rndne_f32_e32 v3, v2
-; GCN-NEXT: v_sub_f32_e32 v4, v2, v3
-; GCN-NEXT: v_fma_f32 v1, s6, v1, -v2
-; GCN-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_fmac_f32_e32 v1, s6, v2
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_add_f32_e32 v1, v4, v1
-; GCN-NEXT: v_cvt_i32_f32_e32 v2, v3
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_exp_f32_e32 v1, v1
+; GCN-NEXT: v_mul_f32_e32 v6, s6, v5
+; GCN-NEXT: v_rndne_f32_e32 v7, v6
+; GCN-NEXT: v_sub_f32_e32 v8, v6, v7
+; GCN-NEXT: v_fma_f32 v5, s6, v5, -v6
+; GCN-NEXT: v_mov_b32_e32 v6, 0x32a5705f
+; GCN-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GCN-NEXT: v_fmac_f32_e32 v5, s6, v6
+; GCN-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GCN-NEXT: v_add_f32_e32 v5, v8, v5
+; GCN-NEXT: v_cvt_i32_f32_e32 v6, v7
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_exp_f32_e32 v5, v5
; GCN-NEXT: s_mov_b32 s0, 0x3fb8aa3b
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
; GCN-NEXT: ; iglp_opt mask(0x00000003)
-; GCN-NEXT: v_ldexp_f32 v1, v1, v2
-; GCN-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2
-; GCN-NEXT: v_mov_b32_e32 v2, 0x42b17218
+; GCN-NEXT: v_ldexp_f32 v5, v5, v6
+; GCN-NEXT: v_mov_b32_e32 v6, 0xc2ce8ed0
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v6
+; GCN-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v2
-; GCN-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
+; GCN-NEXT: v_mov_b32_e32 v6, 0x7f800000
; GCN-NEXT: s_mov_b32 s6, 0xc2ce8ed0
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_ldexp_f32 v3, v3, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_ldexp_f32 v7, v7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_ldexp_f32 v3, v3, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_ldexp_f32 v7, v7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_ldexp_f32 v3, v3, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_ldexp_f32 v7, v7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_ldexp_f32 v3, v3, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_ldexp_f32 v7, v7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_ldexp_f32 v3, v3, v4
-; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_ldexp_f32 v7, v7, v8
+; GCN-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v1
-; GCN-NEXT: v_fma_f32 v4, v1, s0, -v3
-; GCN-NEXT: v_rndne_f32_e32 v5, v3
-; GCN-NEXT: v_fmac_f32_e32 v4, 0x32a5705f, v1
-; GCN-NEXT: v_sub_f32_e32 v3, v3, v5
-; GCN-NEXT: v_add_f32_e32 v3, v3, v4
-; GCN-NEXT: v_exp_f32_e32 v3, v3
-; GCN-NEXT: v_cvt_i32_f32_e32 v4, v5
-; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v0, a[0:3]
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v1
-; GCN-NEXT: v_ldexp_f32 v0, v3, v4
+; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v5
+; GCN-NEXT: v_fma_f32 v8, v5, s0, -v7
+; GCN-NEXT: v_rndne_f32_e32 v9, v7
+; GCN-NEXT: v_fmac_f32_e32 v8, 0x32a5705f, v5
+; GCN-NEXT: v_sub_f32_e32 v7, v7, v9
+; GCN-NEXT: v_add_f32_e32 v7, v7, v8
+; GCN-NEXT: v_exp_f32_e32 v7, v7
+; GCN-NEXT: v_cvt_i32_f32_e32 v8, v9
+; GCN-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v4, v[0:3]
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
+; GCN-NEXT: v_ldexp_f32 v4, v7, v8
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v1
+; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GCN-NEXT: v_fma_f32 v3, v0, s0, -v1
-; GCN-NEXT: v_rndne_f32_e32 v4, v1
-; GCN-NEXT: v_fmac_f32_e32 v3, 0x32a5705f, v0
-; GCN-NEXT: v_sub_f32_e32 v1, v1, v4
-; GCN-NEXT: v_add_f32_e32 v1, v1, v3
-; GCN-NEXT: v_exp_f32_e32 v1, v1
-; GCN-NEXT: v_cvt_i32_f32_e32 v3, v4
+; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; GCN-NEXT: v_mul_f32_e32 v5, 0x3fb8aa3b, v4
+; GCN-NEXT: v_fma_f32 v7, v4, s0, -v5
+; GCN-NEXT: v_rndne_f32_e32 v8, v5
+; GCN-NEXT: v_fmac_f32_e32 v7, 0x32a5705f, v4
+; GCN-NEXT: v_sub_f32_e32 v5, v5, v8
+; GCN-NEXT: v_add_f32_e32 v5, v5, v7
+; GCN-NEXT: v_exp_f32_e32 v5, v5
+; GCN-NEXT: v_cvt_i32_f32_e32 v7, v8
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v0
-; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_ldexp_f32 v1, v1, v3
-; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v0
+; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v4
+; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_ldexp_f32 v5, v5, v7
+; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dwordx4 v4, a[0:3], s[0:1]
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: global_store_dword v4, v0, s[2:3]
+; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; GCN-NEXT: global_store_dword v8, v4, s[2:3]
; GCN-NEXT: s_endpgm
%mai0 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in1, i32 0, i32 0, i32 0)
%mai1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai0, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
index 7959cee..b2931ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefix=GCN %s
define amdgpu_kernel void @test_iglp_opt() #0 {
; GCN-LABEL: test_iglp_opt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
index 12a998a..f0040b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefix=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck --check-prefix=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x2bf16(<2 x i16>, <2 x i16>, <16 x float>, i32, i32, i32)
@@ -201,6 +202,62 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x2bf16:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v33, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v34, 2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v32, 0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x2bf16 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 15
+; GFX90A-VGPR-NEXT: s_nop 2
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -311,6 +368,32 @@ define amdgpu_kernel void @test_mfma_f32_16x16x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x2bf16:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x2bf16 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-VGPR-NEXT: s_nop 9
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -367,6 +450,23 @@ define amdgpu_kernel void @test_mfma_f32_4x4x2bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x2bf16:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_4x4x2bf16 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 4
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -478,6 +578,33 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_32x32x4bf16 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-VGPR-NEXT: s_nop 15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -534,6 +661,23 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8bf16(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x8bf16:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_f32_16x16x8bf16 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 10
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%a = bitcast i32 1 to <2 x i16>
@@ -544,5 +688,3 @@ bb:
}
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 5ab8706..3236864 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -17,115 +17,115 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-LABEL: test_mfma_f32_32x32x4bf16_1k:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: v_mov_b32_e32 v33, 0
+; GFX90A-NEXT: v_mov_b32_e32 v34, 1
+; GFX90A-NEXT: v_mov_b32_e32 v35, v33
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4bf16_1k v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
-; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4bf16_1k:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: v_mov_b32_e32 v33, 0
+; GFX942-NEXT: v_mov_b32_e32 v34, 1
+; GFX942-NEXT: v_mov_b32_e32 v35, v33
+; GFX942-NEXT: v_mov_b32_e32 v32, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_bf16 v[0:31], v[34:35], v[32:33], v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v1, a[24:27], s[34:35] offset:96
-; GFX942-NEXT: global_store_dwordx4 v1, a[28:31], s[34:35] offset:112
-; GFX942-NEXT: global_store_dwordx4 v1, a[16:19], s[34:35] offset:64
-; GFX942-NEXT: global_store_dwordx4 v1, a[20:23], s[34:35] offset:80
-; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[34:35] offset:32
-; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[34:35] offset:48
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[34:35]
-; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: global_store_dwordx4 v33, v[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v33, v[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v33, v[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v33, v[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v33, v[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v33, v[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v33, v[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v33, v[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x4bf16_1k:
@@ -254,71 +254,55 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-LABEL: test_mfma_f32_16x16x4bf16_1k:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-NEXT: v_mov_b32_e32 v16, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x4bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 10
-; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x4bf16_1k:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-NEXT: v_mov_b32_e32 v16, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x4_4b_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 10
-; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x4bf16_1k:
@@ -387,41 +371,37 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4bf16_1k(ptr addrspace(1) %arg) #0
; GFX90A-LABEL: test_mfma_f32_4x4x4bf16_1k:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4bf16_1k:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v4, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f32_4x4x4bf16_1k:
@@ -472,72 +452,56 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8bf16_1k(ptr addrspace(1) %arg) #
; GFX90A-LABEL: test_mfma_f32_32x32x8bf16_1k:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 1
+; GFX90A-NEXT: v_mov_b32_e32 v19, v17
+; GFX90A-NEXT: v_mov_b32_e32 v16, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x8bf16_1k v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x8bf16_1k:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 1
+; GFX942-NEXT: v_mov_b32_e32 v19, v17
+; GFX942-NEXT: v_mov_b32_e32 v16, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x8_bf16 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 10
-; GFX942-NEXT: global_store_dwordx4 v1, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v1, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v1, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v17, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v17, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v17, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v17, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f32_32x32x8bf16_1k:
@@ -607,41 +571,37 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16bf16_1k(ptr addrspace(1) %arg)
; GFX90A-LABEL: test_mfma_f32_16x16x16bf16_1k:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 1
-; GFX90A-NEXT: v_mov_b32_e32 v3, v1
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_mov_b32_e32 v4, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16bf16_1k v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 10
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16bf16_1k:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v2, 1
-; GFX942-NEXT: v_mov_b32_e32 v3, v1
-; GFX942-NEXT: v_mov_b32_e32 v0, 2
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 1
+; GFX942-NEXT: v_mov_b32_e32 v7, v5
+; GFX942-NEXT: v_mov_b32_e32 v4, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 a[0:3], v[2:3], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_bf16 v[0:3], v[6:7], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f32_16x16x16bf16_1k:
@@ -697,12 +657,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], 0
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 v[4:5], v[0:1], v[2:3], 0
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_nop 7
-; GFX90A-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_4x4x4f64:
@@ -713,12 +673,12 @@ define amdgpu_kernel void @test_mfma_f64_4x4x4f64(ptr addrspace(1) %arg, double
; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], 0
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 v[4:5], v[0:1], v[2:3], 0
; GFX942-NEXT: s_nop 3
-; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], v[2:3], a[0:1] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[4:5] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: global_store_dwordx2 v0, a[0:1], s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_4x4x4f64:
@@ -765,26 +725,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
+; GFX90A-NEXT: v_mov_b32_e32 v10, s10
; GFX90A-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s11
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v11, s11
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64:
@@ -792,26 +748,22 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: v_mov_b32_e32 v10, s10
; GFX942-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
-; GFX942-NEXT: v_mov_b32_e32 v3, s11
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-NEXT: v_mov_b32_e32 v11, s11
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[10:11], v[8:9], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[8:9] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[8:9]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64:
@@ -872,16 +824,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 0
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
@@ -889,16 +841,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_0(ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 0
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_0:
@@ -947,16 +899,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
@@ -964,16 +916,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_neg1(ptr addrs
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_neg1:
@@ -1022,16 +974,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 1.0
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
@@ -1039,16 +991,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_1(ptr addrspace(1)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 1.0
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_1:
@@ -1097,16 +1049,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], -1.0
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
@@ -1114,16 +1066,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_neg1(ptr addrspace
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], -1.0
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], -1.0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_neg1:
@@ -1172,16 +1124,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], 64
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], 64
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
@@ -1189,16 +1141,16 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64(ptr addrspa
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], 64
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], 64
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64:
@@ -1246,50 +1198,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bit
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 64
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 64
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 64
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 64
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[10:11], v[12:13], v[2:9] cbsz:1 abid:2 neg:[1,1,0]
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_bits:
@@ -1359,50 +1317,50 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 64
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 64
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 64
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_mov_b32_e32 v0, 64
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_int_64_in_high_and_low:
@@ -1466,50 +1424,50 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 neg:[1,1,0]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7]
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[0:7], v[8:9], v[10:11], v[0:7] cbsz:1 abid:2 neg:[1,1,0]
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_imm_f32_1_in_high_and_low:
@@ -1573,52 +1531,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3ff00000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_imm:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x3ff00000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0x3ff00000
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_imm:
@@ -1687,52 +1649,56 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x405ec000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v12, s2
+; GFX90A-NEXT: v_mov_b32_e32 v13, s3
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[2:3], v[0:1], a[0:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f64_16x16x4f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x405ec000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x405ec000
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s2
-; GFX942-NEXT: v_mov_b32_e32 v3, s3
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: v_mov_b32_e32 v12, s2
+; GFX942-NEXT: v_mov_b32_e32 v13, s3
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f64_16x16x4_f64 a[0:7], v[2:3], v[0:1], a[0:7]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f64_16x16x4_f64 v[2:9], v[12:13], v[10:11], v[2:9]
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX90A-VGPR-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
index dc4c9291..477c74c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll
@@ -34,85 +34,77 @@ define amdgpu_kernel void @test_mfma_i32_16x16x32i8(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_16x16x32i8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_i32_16x16x32_i8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_i32_16x16x32i8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -165,145 +157,113 @@ define amdgpu_kernel void @test_mfma_i32_32x32x16i8(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_i32_32x32x16i8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_i32_32x32x16i8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_i32_32x32x16i8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_i32_32x32x16i8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_i32_32x32x16_i8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
@@ -316,85 +276,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -447,85 +399,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_bf8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_bf8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -578,85 +522,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_bf8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_bf8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -709,85 +645,77 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX950-SDAG-NEXT: s_nop 7
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX950-GISEL-NEXT: s_nop 6
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX950-GISEL-NEXT: s_endpgm
; GFX942-AGPRCD-SDAG-LABEL: test_mfma_f32_16x16x32_fp8_fp8:
; GFX942-AGPRCD-SDAG: ; %bb.0: ; %bb
@@ -840,145 +768,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -991,145 +887,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_bf8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_bf8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1142,145 +1006,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_bf8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_bf8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_bf8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -1293,145 +1125,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_fp8_fp8(ptr addrspace(1) %arg)
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX942-GISEL: ; %bb.0: ; %bb
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
;
; GFX950-SDAG-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX950-SDAG: ; %bb.0: ; %bb
; GFX950-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, 3
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-SDAG-NEXT: s_nop 1
-; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-SDAG-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX950-SDAG-NEXT: s_nop 10
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX950-SDAG-NEXT: s_endpgm
;
; GFX950-GISEL-LABEL: test_mfma_f32_32x32x16_fp8_fp8:
; GFX950-GISEL: ; %bb.0: ; %bb
; GFX950-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, 1
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 4
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, 3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, 3
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX950-GISEL-NEXT: s_nop 1
-; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-GISEL-NEXT: v_mfma_f32_32x32x16_fp8_fp8 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX950-GISEL-NEXT: s_nop 10
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX950-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX950-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX950-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
index 033a35f..951763b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefix=AGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -enable-var-scope --check-prefix=VGPR %s
; FIXME: bfloat vector arguments are broken in globalisel.
; https://github.com/llvm/llvm-project/issues/77055
@@ -77,6 +78,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b32_e32 v16, s16
+; AGPR-NEXT: v_mov_b32_e32 v17, s17
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v18, s18
+; AGPR-NEXT: v_mov_b32_e32 v19, s19
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0
+; AGPR-NEXT: s_nop 4
+; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b32_e32 v48, s16
+; VGPR-NEXT: v_mov_b32_e32 v49, s17
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[32:35], v[36:39], v[0:15]
+; VGPR-NEXT: v_mov_b32_e32 v50, s18
+; VGPR-NEXT: v_mov_b32_e32 v51, s19
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x float> %result, ptr addrspace(1) null
store volatile <16 x float> %arg2, ptr addrspace(1) null
@@ -150,6 +278,133 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
+; AGPR-NEXT: v_accvgpr_write_b32 a0, s8
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
+; AGPR-NEXT: v_accvgpr_write_b32 a1, s9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, s10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, s11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, s12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, s13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, s14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, s15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, s16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, s17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, s18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, s19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, s20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, s21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, s22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, s23
+; AGPR-NEXT: v_mov_b32_e32 v16, s16
+; AGPR-NEXT: v_mov_b32_e32 v17, s17
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v18, s18
+; AGPR-NEXT: v_mov_b32_e32 v19, s19
+; AGPR-NEXT: v_mov_b32_e32 v0, s20
+; AGPR-NEXT: v_mov_b32_e32 v1, s21
+; AGPR-NEXT: v_mov_b32_e32 v2, s22
+; AGPR-NEXT: v_mov_b32_e32 v3, s23
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0
+; AGPR-NEXT: s_nop 4
+; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s8
+; AGPR-NEXT: v_mov_b32_e32 v1, s9
+; AGPR-NEXT: v_mov_b32_e32 v2, s10
+; AGPR-NEXT: v_mov_b32_e32 v3, s11
+; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v0, s12
+; AGPR-NEXT: v_mov_b32_e32 v1, s13
+; AGPR-NEXT: v_mov_b32_e32 v2, s14
+; AGPR-NEXT: v_mov_b32_e32 v3, s15
+; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48
+; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32
+; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: v_mov_b32_e32 v48, s16
+; VGPR-NEXT: v_mov_b32_e32 v49, s17
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; VGPR-NEXT: v_mov_b32_e32 v50, s18
+; VGPR-NEXT: v_mov_b32_e32 v51, s19
+; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: v_mov_b32_e32 v0, s20
+; VGPR-NEXT: v_mov_b32_e32 v1, s21
+; VGPR-NEXT: v_mov_b32_e32 v2, s22
+; VGPR-NEXT: v_mov_b32_e32 v3, s23
+; VGPR-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s8
+; VGPR-NEXT: v_mov_b32_e32 v1, s9
+; VGPR-NEXT: v_mov_b32_e32 v2, s10
+; VGPR-NEXT: v_mov_b32_e32 v3, s11
+; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v0, s12
+; VGPR-NEXT: v_mov_b32_e32 v1, s13
+; VGPR-NEXT: v_mov_b32_e32 v2, s14
+; VGPR-NEXT: v_mov_b32_e32 v3, s15
+; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1)
store volatile <16 x float> %result, ptr addrspace(1) null
store volatile <16 x float> %arg2, ptr addrspace(1) null
@@ -196,6 +451,71 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[8:23], v[0:3], v[4:7], v[8:23]
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
ret <16 x float> %result
}
@@ -240,6 +560,71 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
; GCN-NEXT: v_accvgpr_read_b32 v14, a14
; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; AGPR-NEXT: v_accvgpr_write_b32 a0, v8
+; AGPR-NEXT: v_accvgpr_write_b32 a1, v9
+; AGPR-NEXT: v_accvgpr_write_b32 a2, v10
+; AGPR-NEXT: v_accvgpr_write_b32 a3, v11
+; AGPR-NEXT: v_accvgpr_write_b32 a4, v12
+; AGPR-NEXT: v_accvgpr_write_b32 a5, v13
+; AGPR-NEXT: v_accvgpr_write_b32 a6, v14
+; AGPR-NEXT: v_accvgpr_write_b32 a7, v15
+; AGPR-NEXT: v_accvgpr_write_b32 a8, v16
+; AGPR-NEXT: v_accvgpr_write_b32 a9, v17
+; AGPR-NEXT: v_accvgpr_write_b32 a10, v18
+; AGPR-NEXT: v_accvgpr_write_b32 a11, v19
+; AGPR-NEXT: v_accvgpr_write_b32 a12, v20
+; AGPR-NEXT: v_accvgpr_write_b32 a13, v21
+; AGPR-NEXT: v_accvgpr_write_b32 a14, v22
+; AGPR-NEXT: v_accvgpr_write_b32 a15, v23
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 3
+; AGPR-NEXT: v_accvgpr_read_b32 v0, a0
+; AGPR-NEXT: v_accvgpr_read_b32 v1, a1
+; AGPR-NEXT: v_accvgpr_read_b32 v2, a2
+; AGPR-NEXT: v_accvgpr_read_b32 v3, a3
+; AGPR-NEXT: v_accvgpr_read_b32 v4, a4
+; AGPR-NEXT: v_accvgpr_read_b32 v5, a5
+; AGPR-NEXT: v_accvgpr_read_b32 v6, a6
+; AGPR-NEXT: v_accvgpr_read_b32 v7, a7
+; AGPR-NEXT: v_accvgpr_read_b32 v8, a8
+; AGPR-NEXT: v_accvgpr_read_b32 v9, a9
+; AGPR-NEXT: v_accvgpr_read_b32 v10, a10
+; AGPR-NEXT: v_accvgpr_read_b32 v11, a11
+; AGPR-NEXT: v_accvgpr_read_b32 v12, a12
+; AGPR-NEXT: v_accvgpr_read_b32 v13, a13
+; AGPR-NEXT: v_accvgpr_read_b32 v14, a14
+; AGPR-NEXT: v_accvgpr_read_b32 v15, a15
+; AGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__mac__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 3
+; VGPR-NEXT: v_mov_b32_e32 v0, v8
+; VGPR-NEXT: v_mov_b32_e32 v1, v9
+; VGPR-NEXT: v_mov_b32_e32 v2, v10
+; VGPR-NEXT: v_mov_b32_e32 v3, v11
+; VGPR-NEXT: v_mov_b32_e32 v4, v12
+; VGPR-NEXT: v_mov_b32_e32 v5, v13
+; VGPR-NEXT: v_mov_b32_e32 v6, v14
+; VGPR-NEXT: v_mov_b32_e32 v7, v15
+; VGPR-NEXT: v_mov_b32_e32 v8, v16
+; VGPR-NEXT: v_mov_b32_e32 v9, v17
+; VGPR-NEXT: v_mov_b32_e32 v10, v18
+; VGPR-NEXT: v_mov_b32_e32 v11, v19
+; VGPR-NEXT: v_mov_b32_e32 v12, v20
+; VGPR-NEXT: v_mov_b32_e32 v13, v21
+; VGPR-NEXT: v_mov_b32_e32 v14, v22
+; VGPR-NEXT: v_mov_b32_e32 v15, v23
+; VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1)
ret <16 x float> %result
}
@@ -301,6 +686,120 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd(<8 x bfloat> %arg
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v44, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; AGPR-NEXT: v_mov_b32_e32 v40, s20
+; AGPR-NEXT: v_mov_b32_e32 v41, s21
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; AGPR-NEXT: v_mov_b32_e32 v42, s22
+; AGPR-NEXT: v_mov_b32_e32 v43, s23
+; AGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: v_mov_b32_e32 v16, s16
+; AGPR-NEXT: v_mov_b32_e32 v17, s17
+; AGPR-NEXT: v_mov_b32_e32 v18, s18
+; AGPR-NEXT: v_mov_b32_e32 v19, s19
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v16, s12
+; AGPR-NEXT: v_mov_b32_e32 v17, s13
+; AGPR-NEXT: v_mov_b32_e32 v18, s14
+; AGPR-NEXT: v_mov_b32_e32 v19, s15
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v16, s8
+; AGPR-NEXT: v_mov_b32_e32 v17, s9
+; AGPR-NEXT: v_mov_b32_e32 v18, s10
+; AGPR-NEXT: v_mov_b32_e32 v19, s11
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v44, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v40, s20
+; VGPR-NEXT: v_mov_b32_e32 v41, s21
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPR-NEXT: v_mov_b32_e32 v42, s22
+; VGPR-NEXT: v_mov_b32_e32 v43, s23
+; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store volatile <16 x float> %arg2, ptr addrspace(1) %out
store volatile <16 x float> %result, ptr addrspace(1) %out
@@ -364,6 +863,120 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd__flags(<8 x bfloa
; GCN-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd__flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: v_mov_b32_e32 v44, 0
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; AGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; AGPR-NEXT: v_mov_b32_e32 v40, s20
+; AGPR-NEXT: v_mov_b32_e32 v41, s21
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; AGPR-NEXT: v_mov_b32_e32 v42, s22
+; AGPR-NEXT: v_mov_b32_e32 v43, s23
+; AGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: v_mov_b32_e32 v16, s16
+; AGPR-NEXT: v_mov_b32_e32 v17, s17
+; AGPR-NEXT: v_mov_b32_e32 v18, s18
+; AGPR-NEXT: v_mov_b32_e32 v19, s19
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v16, s12
+; AGPR-NEXT: v_mov_b32_e32 v17, s13
+; AGPR-NEXT: v_mov_b32_e32 v18, s14
+; AGPR-NEXT: v_mov_b32_e32 v19, s15
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_nop 0
+; AGPR-NEXT: v_mov_b32_e32 v16, s8
+; AGPR-NEXT: v_mov_b32_e32 v17, s9
+; AGPR-NEXT: v_mov_b32_e32 v18, s10
+; AGPR-NEXT: v_mov_b32_e32 v19, s11
+; AGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; AGPR-NEXT: s_waitcnt vmcnt(0)
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd__flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: v_mov_b32_e32 v44, 0
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; VGPR-NEXT: v_mov_b32_e32 v40, s20
+; VGPR-NEXT: v_mov_b32_e32 v41, s21
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPR-NEXT: v_mov_b32_e32 v42, s22
+; VGPR-NEXT: v_mov_b32_e32 v43, s23
+; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: v_mov_b32_e32 v16, s16
+; VGPR-NEXT: v_mov_b32_e32 v17, s17
+; VGPR-NEXT: v_mov_b32_e32 v18, s18
+; VGPR-NEXT: v_mov_b32_e32 v19, s19
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s12
+; VGPR-NEXT: v_mov_b32_e32 v17, s13
+; VGPR-NEXT: v_mov_b32_e32 v18, s14
+; VGPR-NEXT: v_mov_b32_e32 v19, s15
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_nop 0
+; VGPR-NEXT: v_mov_b32_e32 v16, s8
+; VGPR-NEXT: v_mov_b32_e32 v17, s9
+; VGPR-NEXT: v_mov_b32_e32 v18, s10
+; VGPR-NEXT: v_mov_b32_e32 v19, s11
+; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPR-NEXT: s_waitcnt vmcnt(0)
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3)
store volatile <16 x float> %arg2, ptr addrspace(1) %out
store volatile <16 x float> %result, ptr addrspace(1) %out
@@ -398,6 +1011,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; AGPR-NEXT: v_mov_b32_e32 v16, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0)
store <16 x float> %result, ptr addrspace(1) %out
ret void
@@ -431,6 +1102,64 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
+;
+; AGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags:
+; AGPR: ; %bb.0:
+; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; AGPR-NEXT: s_waitcnt lgkmcnt(0)
+; AGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; AGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; AGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; AGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; AGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; AGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; AGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; AGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; AGPR-NEXT: s_nop 1
+; AGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; AGPR-NEXT: v_mov_b32_e32 v16, 0
+; AGPR-NEXT: s_nop 7
+; AGPR-NEXT: s_nop 2
+; AGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; AGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; AGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; AGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; AGPR-NEXT: s_endpgm
+;
+; VGPR-LABEL: test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags:
+; VGPR: ; %bb.0:
+; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
+; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
+; VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; VGPR-NEXT: s_nop 1
+; VGPR-NEXT: v_mfma_f32_32x32x16_bf16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPR-NEXT: v_mov_b32_e32 v16, 0
+; VGPR-NEXT: s_nop 7
+; VGPR-NEXT: s_nop 2
+; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; VGPR-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1)
store <16 x float> %result, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 7532062..65beb18 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -15,17 +15,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg
; GCN-LABEL: test_mfma_f32_16x16x32_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16:
@@ -77,17 +67,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal
; GCN-LABEL: test_mfma_f32_16x16x32_f16__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags:
@@ -382,66 +362,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[40:41], 48
+; SDAG-NEXT: v_mov_b64_e32 v[42:43], 32
+; SDAG-NEXT: v_mov_b64_e32 v[44:45], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
-; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: v_mov_b32_e32 v48, s16
+; SDAG-NEXT: v_mov_b32_e32 v49, s17
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v50, s18
+; SDAG-NEXT: v_mov_b32_e32 v51, s19
+; SDAG-NEXT: v_mov_b64_e32 v[46:47], 0
+; SDAG-NEXT: s_nop 8
+; SDAG-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -449,58 +421,50 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: s_nop 10
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -765,66 +729,58 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[8:9], 48
-; SDAG-NEXT: v_mov_b64_e32 v[10:11], 32
-; SDAG-NEXT: v_mov_b64_e32 v[12:13], 16
+; SDAG-NEXT: v_mov_b64_e32 v[40:41], 48
+; SDAG-NEXT: v_mov_b64_e32 v[42:43], 32
+; SDAG-NEXT: v_mov_b64_e32 v[44:45], 16
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v16, s16
-; SDAG-NEXT: v_mov_b32_e32 v17, s17
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v18, s18
-; SDAG-NEXT: v_mov_b32_e32 v19, s19
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
-; SDAG-NEXT: v_mov_b64_e32 v[14:15], 0
-; SDAG-NEXT: s_nop 4
-; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: v_mov_b32_e32 v48, s16
+; SDAG-NEXT: v_mov_b32_e32 v49, s17
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v50, s18
+; SDAG-NEXT: v_mov_b32_e32 v51, s19
+; SDAG-NEXT: v_mov_b64_e32 v[46:47], 0
+; SDAG-NEXT: s_nop 8
+; SDAG-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[10:11], v[16:19], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: global_store_dwordx4 v[42:43], v[48:51], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
; SDAG-NEXT: v_mov_b32_e32 v2, s10
; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -832,58 +788,50 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, <
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: s_nop 10
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1147,41 +1095,24 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half
; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23]
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac:
@@ -1317,41 +1248,24 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8
; GCN-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags:
@@ -2539,17 +2453,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4
; GCN-LABEL: test_mfma_i32_16x16x64_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8:
@@ -2601,17 +2505,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a
; GCN-LABEL: test_mfma_i32_16x16x64_i8__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags:
@@ -2964,70 +2858,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], 48
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], 32
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s28
-; SDAG-NEXT: v_mov_b32_e32 v9, s29
-; SDAG-NEXT: v_mov_b32_e32 v10, s30
-; SDAG-NEXT: v_mov_b32_e32 v11, s31
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b32_e32 v40, s28
+; SDAG-NEXT: v_mov_b32_e32 v41, s29
+; SDAG-NEXT: v_mov_b32_e32 v42, s30
+; SDAG-NEXT: v_mov_b32_e32 v43, s31
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: s_nop 5
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 0
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3035,58 +2926,50 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: s_nop 10
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3376,70 +3259,67 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], 48
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], 32
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b32_e32 v8, s28
-; SDAG-NEXT: v_mov_b32_e32 v9, s29
-; SDAG-NEXT: v_mov_b32_e32 v10, s30
-; SDAG-NEXT: v_mov_b32_e32 v11, s31
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b32_e32 v40, s28
+; SDAG-NEXT: v_mov_b32_e32 v41, s29
+; SDAG-NEXT: v_mov_b32_e32 v42, s30
+; SDAG-NEXT: v_mov_b32_e32 v43, s31
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: v_mov_b32_e32 v8, s16
-; SDAG-NEXT: v_mov_b32_e32 v9, s17
-; SDAG-NEXT: v_mov_b32_e32 v10, s18
-; SDAG-NEXT: v_mov_b32_e32 v11, s19
-; SDAG-NEXT: s_nop 5
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT: s_nop 11
+; SDAG-NEXT: global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[34:35], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], 16
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], 0
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: global_store_dwordx4 v[34:35], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
-; SDAG-NEXT: v_mov_b32_e32 v9, s21
-; SDAG-NEXT: v_mov_b32_e32 v10, s22
-; SDAG-NEXT: v_mov_b32_e32 v11, s23
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[8:11], off sc0 sc1
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s8
; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[0:3], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
; SDAG-NEXT: v_mov_b32_e32 v0, s12
; SDAG-NEXT: v_mov_b32_e32 v1, s13
; SDAG-NEXT: v_mov_b32_e32 v2, s14
; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -3447,58 +3327,50 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], 0
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], 16
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 32
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], 0
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], 16
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], 32
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 48
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: s_nop 8
-; GISEL-NEXT: global_store_dwordx4 v[12:13], a[16:19], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], 48
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: s_nop 10
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], a[20:23], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[24:27], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[28:31], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[12:13], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[44:45], v[40:43], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -3787,41 +3659,24 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar
; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
+; GCN-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23]
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac:
@@ -3957,41 +3812,24 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
; GCN-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
+; GCN-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags:
@@ -5299,17 +5137,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat>
; GCN-LABEL: test_mfma_f32_16x16x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16:
@@ -5361,17 +5189,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x
; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
index d24f1f0..61593a8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.i8.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A-VGPR %s
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
@@ -109,6 +110,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_i32_32x32x8i8:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v17, 2
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_i32_32x32x8i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-VGPR-NEXT: s_nop 15
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -163,6 +191,23 @@ define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: s_nop 10
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
+;
+; GFX90A-VGPR-LABEL: test_mfma_i32_16x16x16i8:
+; GFX90A-VGPR: ; %bb.0: ; %bb
+; GFX90A-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v4, 1
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v6, 2
+; GFX90A-VGPR-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX90A-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-VGPR-NEXT: s_nop 1
+; GFX90A-VGPR-NEXT: v_mfma_i32_16x16x16i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-VGPR-NEXT: s_nop 10
+; GFX90A-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
+; GFX90A-VGPR-NEXT: s_endpgm
bb:
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af9..c31ea52 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -294,113 +294,113 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_f32_32x32x1f32:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32:
@@ -603,69 +603,53 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_f32_16x16x1f32:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x1f32:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32:
@@ -760,39 +744,35 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_f32_4x4x1f32:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 3
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32:
@@ -956,71 +936,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_f32_32x32x2f32:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x2f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x2f32:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32:
@@ -1116,39 +1080,35 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_f32_16x16x4f32:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x4f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 10
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x4f32:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 9
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32:
@@ -1456,121 +1416,121 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-LABEL: test_mfma_f32_32x32x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX90A-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-NEXT: v_mov_b32_e32 v19, s3
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX90A-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v3, s19
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: v_mov_b32_e32 v3, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX90A-NEXT: v_mov_b32_e32 v4, s2
-; GFX90A-NEXT: v_mov_b32_e32 v5, s3
+; GFX90A-NEXT: v_mov_b32_e32 v34, s0
+; GFX90A-NEXT: v_mov_b32_e32 v35, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-NEXT: v_mov_b32_e32 v31, s15
+; GFX90A-NEXT: v_mov_b32_e32 v36, s2
+; GFX90A-NEXT: v_mov_b32_e32 v37, s3
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x4f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s0
-; GFX942-NEXT: v_mov_b32_e32 v3, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
-; GFX942-NEXT: v_mov_b32_e32 v4, s2
-; GFX942-NEXT: v_mov_b32_e32 v5, s3
+; GFX942-NEXT: v_mov_b32_e32 v34, s0
+; GFX942-NEXT: v_mov_b32_e32 v35, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
+; GFX942-NEXT: v_mov_b32_e32 v36, s2
+; GFX942-NEXT: v_mov_b32_e32 v37, s3
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37]
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37]
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16:
@@ -1790,34 +1750,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: v_mov_b32_e32 v1, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v2, s22
-; GFX90A-NEXT: v_mov_b32_e32 v3, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_mov_b32_e32 v16, s20
+; GFX90A-NEXT: v_mov_b32_e32 v17, s21
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v18, s22
+; GFX90A-NEXT: v_mov_b32_e32 v19, s23
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_16x16x4f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x4f16:
@@ -1827,34 +1779,26 @@ define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 9
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16:
@@ -1961,45 +1905,41 @@ define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr add
; GFX90A-LABEL: test_mfma_f32_4x4x4f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: v_mov_b32_e32 v3, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v4, s6
-; GFX90A-NEXT: v_mov_b32_e32 v5, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX90A-NEXT: v_mov_b32_e32 v6, s4
+; GFX90A-NEXT: v_mov_b32_e32 v7, s5
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v8, s6
+; GFX90A-NEXT: v_mov_b32_e32 v9, s7
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_4x4x4f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x4f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: v_mov_b32_e32 v3, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v4, s6
-; GFX942-NEXT: v_mov_b32_e32 v5, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16:
@@ -2179,35 +2119,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s20
-; GFX90A-NEXT: v_mov_b32_e32 v1, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v2, s22
-; GFX90A-NEXT: v_mov_b32_e32 v3, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_mov_b32_e32 v16, s20
+; GFX90A-NEXT: v_mov_b32_e32 v17, s21
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v18, s22
+; GFX90A-NEXT: v_mov_b32_e32 v19, s23
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x8f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x8f16:
@@ -2217,34 +2149,26 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr a
; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s20
-; GFX942-NEXT: v_mov_b32_e32 v1, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_mov_b32_e32 v2, s22
-; GFX942-NEXT: v_mov_b32_e32 v3, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b32_e32 v16, s20
+; GFX942-NEXT: v_mov_b32_e32 v17, s21
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b32_e32 v18, s22
+; GFX942-NEXT: v_mov_b32_e32 v19, s23
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 9
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16:
@@ -2351,45 +2275,41 @@ define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr
; GFX90A-LABEL: test_mfma_f32_16x16x16f16:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s4
-; GFX90A-NEXT: v_mov_b32_e32 v3, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX90A-NEXT: v_mov_b32_e32 v4, s6
-; GFX90A-NEXT: v_mov_b32_e32 v5, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX90A-NEXT: v_mov_b32_e32 v6, s4
+; GFX90A-NEXT: v_mov_b32_e32 v7, s5
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v8, s6
+; GFX90A-NEXT: v_mov_b32_e32 v9, s7
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_16x16x16f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 10
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x16f16:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v2, s4
-; GFX942-NEXT: v_mov_b32_e32 v3, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s8
-; GFX942-NEXT: v_mov_b32_e32 v4, s6
-; GFX942-NEXT: v_mov_b32_e32 v5, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX942-NEXT: v_mov_b32_e32 v6, s4
+; GFX942-NEXT: v_mov_b32_e32 v7, s5
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-NEXT: v_mov_b32_e32 v8, s6
+; GFX942-NEXT: v_mov_b32_e32 v9, s7
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16:
@@ -2667,113 +2587,113 @@ define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_i32_32x32x4i8:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1
+; GFX90A-NEXT: v_mov_b32_e32 v34, 2
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_i32_32x32x4i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_32x32x4i8:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v1, 1
-; GFX942-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1
+; GFX942-NEXT: v_mov_b32_e32 v34, 2
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8:
@@ -2976,69 +2896,53 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_i32_16x16x4i8:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: v_mov_b32_e32 v16, 1
+; GFX90A-NEXT: v_mov_b32_e32 v17, 2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_i32_16x16x4i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_16x16x4i8:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-NEXT: v_mov_b32_e32 v17, 2
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 9
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8:
@@ -3157,15 +3061,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: v_mov_b32_e32 v1, 2
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mfma_i32_16x16x4i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 8
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 9
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3173,15 +3076,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: v_mov_b32_e32 v1, 2
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
@@ -3265,39 +3167,35 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_i32_4x4x4i8:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 1
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 1
+; GFX942-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8:
@@ -3368,26 +3266,26 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
@@ -3458,38 +3356,36 @@ define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1)
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX90A: ; %bb.0:
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1
+; GFX90A-NEXT: v_mov_b32_e32 v5, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_i32_4x4x4i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX942: ; %bb.0:
-; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: v_mov_b32_e32 v5, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x41
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, 2
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
@@ -3790,115 +3686,115 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1)
; GFX90A-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX90A-NEXT: v_mov_b32_e32 v0, s16
+; GFX90A-NEXT: v_mov_b32_e32 v1, s17
+; GFX90A-NEXT: v_mov_b32_e32 v2, s18
+; GFX90A-NEXT: v_mov_b32_e32 v3, s19
+; GFX90A-NEXT: v_mov_b32_e32 v4, s20
+; GFX90A-NEXT: v_mov_b32_e32 v5, s21
+; GFX90A-NEXT: v_mov_b32_e32 v6, s22
+; GFX90A-NEXT: v_mov_b32_e32 v7, s23
+; GFX90A-NEXT: v_mov_b32_e32 v8, s24
+; GFX90A-NEXT: v_mov_b32_e32 v9, s25
+; GFX90A-NEXT: v_mov_b32_e32 v10, s26
+; GFX90A-NEXT: v_mov_b32_e32 v11, s27
+; GFX90A-NEXT: v_mov_b32_e32 v12, s28
+; GFX90A-NEXT: v_mov_b32_e32 v13, s29
+; GFX90A-NEXT: v_mov_b32_e32 v14, s30
+; GFX90A-NEXT: v_mov_b32_e32 v15, s31
+; GFX90A-NEXT: v_mov_b32_e32 v16, s0
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s2
+; GFX90A-NEXT: v_mov_b32_e32 v19, s3
+; GFX90A-NEXT: v_mov_b32_e32 v20, s4
+; GFX90A-NEXT: v_mov_b32_e32 v21, s5
+; GFX90A-NEXT: v_mov_b32_e32 v22, s6
+; GFX90A-NEXT: v_mov_b32_e32 v23, s7
+; GFX90A-NEXT: v_mov_b32_e32 v24, s8
+; GFX90A-NEXT: v_mov_b32_e32 v25, s9
+; GFX90A-NEXT: v_mov_b32_e32 v26, s10
+; GFX90A-NEXT: v_mov_b32_e32 v27, s11
+; GFX90A-NEXT: v_mov_b32_e32 v28, s12
+; GFX90A-NEXT: v_mov_b32_e32 v29, s13
+; GFX90A-NEXT: v_mov_b32_e32 v30, s14
+; GFX90A-NEXT: v_mov_b32_e32 v31, s15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
@@ -4103,71 +3999,55 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1)
; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15]
+; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v16, v17, v[0:15]
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v17, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15]
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc:
@@ -4266,42 +4146,38 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v4, v5, v[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v4, v5, v[0:3]
; GFX90A-NEXT: s_nop 4
-; GFX90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
; GFX942-NEXT: s_nop 3
-; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX942-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
@@ -4375,26 +4251,26 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %ar
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, 1.0
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, 1.0
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
@@ -4509,15 +4385,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v0, v1, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 8
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 9
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
@@ -4525,15 +4400,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 7
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 8
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
@@ -4657,16 +4531,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40004000
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x8f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
@@ -4676,15 +4549,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mov_b32_e32 v2, 0x40004000
; GFX942-NEXT: v_mov_b32_e32 v3, v2
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 9
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
@@ -4870,20 +4742,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v0, v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
@@ -4891,19 +4762,19 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 15
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
@@ -4978,36 +4849,32 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0
;
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v0, v1, v[0:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 2.0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3]
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm:
@@ -5141,64 +5008,60 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #
;
; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 2.0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v0, v15, v[0:15]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 2.0
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v15, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15]
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm:
@@ -5436,106 +5299,136 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #
;
; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a1
-; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: v_mov_b32_e32 v3, v1
+; GFX90A-NEXT: v_mov_b32_e32 v4, v1
+; GFX90A-NEXT: v_mov_b32_e32 v5, v1
+; GFX90A-NEXT: v_mov_b32_e32 v6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v1
+; GFX90A-NEXT: v_mov_b32_e32 v8, v1
+; GFX90A-NEXT: v_mov_b32_e32 v9, v1
+; GFX90A-NEXT: v_mov_b32_e32 v10, v1
+; GFX90A-NEXT: v_mov_b32_e32 v11, v1
+; GFX90A-NEXT: v_mov_b32_e32 v12, v1
+; GFX90A-NEXT: v_mov_b32_e32 v13, v1
+; GFX90A-NEXT: v_mov_b32_e32 v14, v1
+; GFX90A-NEXT: v_mov_b32_e32 v15, v1
+; GFX90A-NEXT: v_mov_b32_e32 v16, v1
+; GFX90A-NEXT: v_mov_b32_e32 v17, v1
+; GFX90A-NEXT: v_mov_b32_e32 v18, v1
+; GFX90A-NEXT: v_mov_b32_e32 v19, v1
+; GFX90A-NEXT: v_mov_b32_e32 v20, v1
+; GFX90A-NEXT: v_mov_b32_e32 v21, v1
+; GFX90A-NEXT: v_mov_b32_e32 v22, v1
+; GFX90A-NEXT: v_mov_b32_e32 v23, v1
+; GFX90A-NEXT: v_mov_b32_e32 v24, v1
+; GFX90A-NEXT: v_mov_b32_e32 v25, v1
+; GFX90A-NEXT: v_mov_b32_e32 v26, v1
+; GFX90A-NEXT: v_mov_b32_e32 v27, v1
+; GFX90A-NEXT: v_mov_b32_e32 v28, v1
+; GFX90A-NEXT: v_mov_b32_e32 v29, v1
+; GFX90A-NEXT: v_mov_b32_e32 v30, v1
+; GFX90A-NEXT: v_mov_b32_e32 v31, v1
+; GFX90A-NEXT: v_pk_mov_b32 v[32:33], v[30:31], v[30:31] op_sel:[0,1]
+; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX90A-NEXT: v_pk_mov_b32 v[30:31], v[28:29], v[28:29] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[28:29], v[26:27], v[26:27] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[26:27], v[24:25], v[24:25] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[24:25], v[22:23], v[22:23] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[22:23], v[20:21], v[20:21] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[20:21], v[18:19], v[18:19] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[18:19], v[16:17], v[16:17] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[16:17], v[14:15], v[14:15] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[14:15], v[12:13], v[12:13] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[10:11], v[10:11] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[10:11], v[8:9], v[8:9] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v0, v34, v[2:33]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a31, a1
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, v1
+; GFX942-NEXT: v_mov_b32_e32 v5, v1
+; GFX942-NEXT: v_mov_b32_e32 v6, v1
+; GFX942-NEXT: v_mov_b32_e32 v7, v1
+; GFX942-NEXT: v_mov_b32_e32 v8, v1
+; GFX942-NEXT: v_mov_b32_e32 v9, v1
+; GFX942-NEXT: v_mov_b32_e32 v10, v1
+; GFX942-NEXT: v_mov_b32_e32 v11, v1
+; GFX942-NEXT: v_mov_b32_e32 v12, v1
+; GFX942-NEXT: v_mov_b32_e32 v13, v1
+; GFX942-NEXT: v_mov_b32_e32 v14, v1
+; GFX942-NEXT: v_mov_b32_e32 v15, v1
+; GFX942-NEXT: v_mov_b32_e32 v16, v1
+; GFX942-NEXT: v_mov_b32_e32 v17, v1
+; GFX942-NEXT: v_mov_b32_e32 v18, v1
+; GFX942-NEXT: v_mov_b32_e32 v19, v1
+; GFX942-NEXT: v_mov_b32_e32 v20, v1
+; GFX942-NEXT: v_mov_b32_e32 v21, v1
+; GFX942-NEXT: v_mov_b32_e32 v22, v1
+; GFX942-NEXT: v_mov_b32_e32 v23, v1
+; GFX942-NEXT: v_mov_b32_e32 v24, v1
+; GFX942-NEXT: v_mov_b32_e32 v25, v1
+; GFX942-NEXT: v_mov_b32_e32 v26, v1
+; GFX942-NEXT: v_mov_b32_e32 v27, v1
+; GFX942-NEXT: v_mov_b32_e32 v28, v1
+; GFX942-NEXT: v_mov_b32_e32 v29, v1
+; GFX942-NEXT: v_mov_b32_e32 v30, v1
+; GFX942-NEXT: v_mov_b32_e32 v31, v1
+; GFX942-NEXT: v_mov_b64_e32 v[32:33], v[30:31]
+; GFX942-NEXT: v_mov_b32_e32 v34, 2.0
+; GFX942-NEXT: v_mov_b64_e32 v[30:31], v[28:29]
+; GFX942-NEXT: v_mov_b64_e32 v[28:29], v[26:27]
+; GFX942-NEXT: v_mov_b64_e32 v[26:27], v[24:25]
+; GFX942-NEXT: v_mov_b64_e32 v[24:25], v[22:23]
+; GFX942-NEXT: v_mov_b64_e32 v[22:23], v[20:21]
+; GFX942-NEXT: v_mov_b64_e32 v[20:21], v[18:19]
+; GFX942-NEXT: v_mov_b64_e32 v[18:19], v[16:17]
+; GFX942-NEXT: v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-NEXT: v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-NEXT: v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-NEXT: v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31]
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm:
@@ -5659,38 +5552,38 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %ar
;
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v5, v6, v[0:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v5, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3]
+; GFX942-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
@@ -5768,38 +5661,36 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspa
;
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX90A: ; %bb.0: ; %bb
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v5, v6, v[0:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX942: ; %bb.0: ; %bb
-; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v5, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3]
+; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
@@ -6022,60 +5913,60 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v34, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX90A-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX90A-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX90A-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX90A-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX90A-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX90A-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX90A-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX90A-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX90A-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX90A-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX90A-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
+; GFX942-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
-; GFX942-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
-; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
-; GFX942-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
-; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
-; GFX942-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
-; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
-; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
+; GFX942-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX942-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX942-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX942-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX942-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX942-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX942-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX942-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index aae14c8..d9359c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -17,17 +17,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -40,17 +30,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_1_1__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -63,17 +43,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_2__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -86,17 +56,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_3__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -109,17 +69,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_3__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -132,17 +82,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_0__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -155,17 +95,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_2_3__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -178,17 +108,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_3_2__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -202,17 +122,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -226,17 +136,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -250,17 +150,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -274,17 +164,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -298,17 +178,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -322,17 +192,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -346,17 +206,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -370,17 +220,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -394,17 +234,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -418,17 +248,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -442,17 +262,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -466,17 +276,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -491,17 +291,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:1 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:1 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -515,17 +305,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -538,17 +318,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -562,17 +332,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:1 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -586,17 +346,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3] cbsz:1 blgp:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17] cbsz:1 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -610,17 +360,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4(<8 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:1 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -634,17 +374,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:11], a[0:3] cbsz:1 blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:11], v[12:15] cbsz:1 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 1, ; cbsz
@@ -658,17 +388,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -682,17 +402,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -706,17 +416,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:2 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -730,17 +430,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:2 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:2 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -754,17 +444,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -778,17 +458,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -802,17 +472,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:2 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -826,17 +486,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:2 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:2 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -851,17 +501,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -875,17 +515,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -899,17 +529,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:3 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -923,17 +543,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3] cbsz:3 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17] cbsz:3 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -947,17 +557,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -971,17 +571,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -995,17 +585,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:3 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -1019,17 +599,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:3 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:3 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -1043,17 +613,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:3 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -1067,17 +627,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:11], a[0:3] cbsz:3 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:11], v[12:15] cbsz:3 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 3, ; cbsz
@@ -1091,17 +641,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4(<6 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:2 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -1115,17 +655,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:9], a[0:3] cbsz:2 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:9], v[10:13] cbsz:2 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -1139,17 +669,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0(<4 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1163,17 +683,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1187,17 +697,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1(<4 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15], v16, v17 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1211,17 +711,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:11], a[0:3] cbsz:4 blgp:1
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:11], v[12:15] cbsz:4 blgp:1
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1235,17 +725,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2(<4 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1259,17 +739,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1283,17 +753,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3(<4 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13], v14, v15 op_sel_hi:[0,0,0] cbsz:4 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1307,17 +767,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:9], a[0:3] cbsz:4 blgp:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:9], v[10:13] cbsz:4 blgp:3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1331,17 +781,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4(<4 x
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11], v12, v13 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1355,17 +795,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:4 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -1382,19 +812,10 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1404,18 +825,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v21, s0
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v20 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1425,18 +837,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v21, s0
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v16 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1446,35 +849,28 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v14, s0
-; SDAG-NEXT: v_mov_b32_e32 v15, s1
-; SDAG-NEXT: v_mov_b32_e32 v16, s2
-; SDAG-NEXT: v_mov_b32_e32 v17, s3
-; SDAG-NEXT: v_mov_b32_e32 v18, s16
-; SDAG-NEXT: v_mov_b32_e32 v19, s17
-; SDAG-NEXT: v_mov_b32_e32 v20, s18
-; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
+; SDAG-NEXT: v_mov_b32_e32 v18, s2
+; SDAG-NEXT: v_mov_b32_e32 v19, s3
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v4, s28
; SDAG-NEXT: v_mov_b32_e32 v5, s29
-; SDAG-NEXT: v_mov_b32_e32 v6, s20
-; SDAG-NEXT: v_mov_b32_e32 v7, s21
-; SDAG-NEXT: v_mov_b32_e32 v8, s22
-; SDAG-NEXT: v_mov_b32_e32 v9, s23
-; SDAG-NEXT: v_mov_b32_e32 v10, s24
-; SDAG-NEXT: v_mov_b32_e32 v11, s25
-; SDAG-NEXT: v_mov_b32_e32 v12, s26
-; SDAG-NEXT: v_mov_b32_e32 v13, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[6:13], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[16:23], v[8:15], v[4:7], v2, v3 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs:
@@ -1488,23 +884,16 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b32_e32 v22, v0
+; GISEL-NEXT: v_mov_b32_e32 v23, v1
; GISEL-NEXT: v_mov_b32_e32 v20, s28
; GISEL-NEXT: v_mov_b32_e32 v21, s29
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[4:11], v[12:19], a[0:3], v2, v3 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[4:11], v[12:19], v[20:23], v2, v3 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1522,18 +911,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
@@ -1547,18 +927,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v13, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v8, v12 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v13, v12 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1576,18 +947,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
@@ -1601,18 +963,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v13, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[14:21], v[0:7], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[8:11], v12, v13 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1630,18 +983,9 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v19, s17
; SDAG-NEXT: v_mov_b32_e32 v20, s18
; SDAG-NEXT: v_mov_b32_e32 v21, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v13, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
@@ -1655,40 +999,36 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v13, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[14:21], a[0:3], v12, v8 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[14:21], v[8:11], v12, v13 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
}
define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_mov_b32_e32 v17, s16
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v18, s0
+; SDAG-NEXT: v_mov_b32_e32 v19, s1
+; SDAG-NEXT: v_mov_b32_e32 v20, s2
+; SDAG-NEXT: v_mov_b32_e32 v21, s3
+; SDAG-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v17, s16
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[18:21], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
}
@@ -1697,26 +1037,21 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s0
-; SDAG-NEXT: v_mov_b32_e32 v11, s1
-; SDAG-NEXT: v_mov_b32_e32 v12, s2
-; SDAG-NEXT: v_mov_b32_e32 v13, s3
-; SDAG-NEXT: v_mov_b32_e32 v14, s16
-; SDAG-NEXT: v_mov_b32_e32 v15, s17
-; SDAG-NEXT: v_mov_b32_e32 v16, s18
-; SDAG-NEXT: v_mov_b32_e32 v17, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s23
+; SDAG-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-NEXT: v_mov_b32_e32 v13, s23
; SDAG-NEXT: v_mov_b32_e32 v9, s24
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[14:21], v[0:7], v[10:13], v8, v9 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
@@ -1730,18 +1065,11 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s23
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[22:23]
; GISEL-NEXT: v_mov_b32_e32 v9, s24
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[10:17], v[0:7], a[0:3], v8, v9 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[10:17], v[0:7], v[18:21], v8, v9 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <4 x float> %result
@@ -1753,35 +1081,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v20, -2
; SDAG-NEXT: v_mov_b32_e32 v21, 33
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 33
-; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: v_mov_b32_e32 v20, 33
+; GISEL-NEXT: v_mov_b32_e32 v21, -2
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <4 x float> %result
@@ -1793,35 +1103,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v20, -2
; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v17, -2
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v21, -2
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
ret <4 x float> %result
@@ -1833,35 +1125,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v20, 0x4d
; SDAG-NEXT: v_mov_b32_e32 v21, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[1,1,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v17, 0x4d
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v21, 0x4d
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[1,1,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
ret <4 x float> %result
@@ -2188,17 +1462,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %result
@@ -2209,17 +1473,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19]
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
ret <4 x float> %result
@@ -2231,35 +1485,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1(<8
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v20, 1
; SDAG-NEXT: v_mov_b32_e32 v21, 0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 0
-; GISEL-NEXT: v_mov_b32_e32 v17, 1
+; GISEL-NEXT: v_mov_b32_e32 v20, 0
+; GISEL-NEXT: v_mov_b32_e32 v21, 1
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <4 x float> %result
@@ -2271,35 +1507,17 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a(
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-NEXT: v_mov_b32_e32 v20, 0
; SDAG-NEXT: v_mov_b32_e32 v21, 1
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v21, v20 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v20 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_mov_b32_e32 v16, 1
-; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: v_mov_b32_e32 v20, 1
+; GISEL-NEXT: v_mov_b32_e32 v21, 0
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v16, v17 op_sel_hi:[0,0,0]
-; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
+; GISEL-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <4 x float> %result
@@ -2313,17 +1531,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -2336,17 +1544,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -2359,17 +1557,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -2382,17 +1570,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6_
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:2 blgp:2
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:2 blgp:2
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 2, ; cbsz
@@ -2405,17 +1583,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v8i32_fp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -2428,17 +1596,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -2451,17 +1609,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp8__v6i32_fp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] blgp:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <4 x float> %arg2,
i32 0, ; cbsz
@@ -2474,17 +1622,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v6i32_fp4__v8i32_fp8:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:5], v[6:13], a[0:3], v18, v19 op_sel_hi:[0,0,0] cbsz:4
-; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:5], v[6:13], v[14:17], v18, v19 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -2497,17 +1635,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4(
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v20, v21 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
@@ -2520,17 +1648,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4_
; GCN-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3] cbsz:4 blgp:4
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19] cbsz:4 blgp:4
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2,
i32 4, ; cbsz
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index f0205a3..978284e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -17,89 +17,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -112,89 +80,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_1_1__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -207,89 +143,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -302,89 +206,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -397,89 +269,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[0,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[0,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -492,89 +332,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -587,89 +395,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_2_3__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[0,1,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[0,1,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -682,89 +458,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_3_2__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel:[1,0,0] op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel:[1,0,0] op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -778,43 +522,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -828,89 +556,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -923,43 +619,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -974,43 +654,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2(<8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1023,42 +686,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1073,43 +719,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3(<8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1122,42 +751,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] blgp:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1171,42 +783,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4(<8 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1219,42 +814,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz0__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -1268,89 +846,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1363,43 +909,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1413,89 +943,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1(<8 x
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:1 blgp:1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:1 blgp:1
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1509,43 +1007,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:1 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:1 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1560,43 +1042,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2(<8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1609,42 +1074,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp2__constant_scale_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1659,43 +1107,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3(<8 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:1 blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1708,42 +1139,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15] cbsz:1 blgp:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29] cbsz:1 blgp:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1757,42 +1171,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4(<8 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:1 blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1805,42 +1202,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz1__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:11], a[0:15] cbsz:1 blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:7], v[8:11], v[12:27] cbsz:1 blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v4i32(<8 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 1, ; cbsz
@@ -1855,43 +1235,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0(<6 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -1904,42 +1267,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -1954,43 +1300,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1(<6 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2003,42 +1332,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:2 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:2 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2052,41 +1364,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2099,41 +1394,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2147,41 +1425,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:2 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2194,41 +1455,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:2 blgp:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:2 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2244,43 +1488,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0(<6 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2293,42 +1520,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2343,43 +1553,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1(<6 x
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:3 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2392,42 +1585,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15] cbsz:3 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29] cbsz:3 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2441,41 +1617,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2488,41 +1647,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2536,41 +1678,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:3 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2583,41 +1708,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:3 blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:3 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2631,41 +1739,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:3 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2678,41 +1769,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz3__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:11], a[0:15] cbsz:3 blgp:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:5], v[6:11], v[12:27] cbsz:3 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v6i32(<6 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 3, ; cbsz
@@ -2726,41 +1800,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4(<6 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:2 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2773,41 +1830,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz2__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:9], a[0:15] cbsz:2 blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:5], v[6:9], v[10:25] cbsz:2 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v4i32(<6 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -2821,42 +1861,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0(<4 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -2869,42 +1892,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp0__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -2918,42 +1924,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1(<4 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27], v28, v29 op_sel_hi:[0,0,0] cbsz:4 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -2966,42 +1955,25 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp1__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v12
-; GCN-NEXT: v_accvgpr_write_b32 a1, v13
-; GCN-NEXT: v_accvgpr_write_b32 a2, v14
-; GCN-NEXT: v_accvgpr_write_b32 a3, v15
-; GCN-NEXT: v_accvgpr_write_b32 a4, v16
-; GCN-NEXT: v_accvgpr_write_b32 a5, v17
-; GCN-NEXT: v_accvgpr_write_b32 a6, v18
-; GCN-NEXT: v_accvgpr_write_b32 a7, v19
-; GCN-NEXT: v_accvgpr_write_b32 a8, v20
-; GCN-NEXT: v_accvgpr_write_b32 a9, v21
-; GCN-NEXT: v_accvgpr_write_b32 a10, v22
-; GCN-NEXT: v_accvgpr_write_b32 a11, v23
-; GCN-NEXT: v_accvgpr_write_b32 a12, v24
-; GCN-NEXT: v_accvgpr_write_b32 a13, v25
-; GCN-NEXT: v_accvgpr_write_b32 a14, v26
-; GCN-NEXT: v_accvgpr_write_b32 a15, v27
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:11], a[0:15] cbsz:4 blgp:1
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[12:27], v[0:3], v[4:11], v[12:27] cbsz:4 blgp:1
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v12
+; GCN-NEXT: v_mov_b32_e32 v1, v13
+; GCN-NEXT: v_mov_b32_e32 v2, v14
+; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_mov_b32_e32 v4, v16
+; GCN-NEXT: v_mov_b32_e32 v5, v17
+; GCN-NEXT: v_mov_b32_e32 v6, v18
+; GCN-NEXT: v_mov_b32_e32 v7, v19
+; GCN-NEXT: v_mov_b32_e32 v8, v20
+; GCN-NEXT: v_mov_b32_e32 v9, v21
+; GCN-NEXT: v_mov_b32_e32 v10, v22
+; GCN-NEXT: v_mov_b32_e32 v11, v23
+; GCN-NEXT: v_mov_b32_e32 v12, v24
+; GCN-NEXT: v_mov_b32_e32 v13, v25
+; GCN-NEXT: v_mov_b32_e32 v14, v26
+; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v8i32(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3015,41 +1987,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2(<4 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3062,41 +2017,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp2__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3110,41 +2048,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3(<4 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25], v26, v27 op_sel_hi:[0,0,0] cbsz:4 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3157,41 +2078,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp3__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v10
-; GCN-NEXT: v_accvgpr_write_b32 a1, v11
-; GCN-NEXT: v_accvgpr_write_b32 a2, v12
-; GCN-NEXT: v_accvgpr_write_b32 a3, v13
-; GCN-NEXT: v_accvgpr_write_b32 a4, v14
-; GCN-NEXT: v_accvgpr_write_b32 a5, v15
-; GCN-NEXT: v_accvgpr_write_b32 a6, v16
-; GCN-NEXT: v_accvgpr_write_b32 a7, v17
-; GCN-NEXT: v_accvgpr_write_b32 a8, v18
-; GCN-NEXT: v_accvgpr_write_b32 a9, v19
-; GCN-NEXT: v_accvgpr_write_b32 a10, v20
-; GCN-NEXT: v_accvgpr_write_b32 a11, v21
-; GCN-NEXT: v_accvgpr_write_b32 a12, v22
-; GCN-NEXT: v_accvgpr_write_b32 a13, v23
-; GCN-NEXT: v_accvgpr_write_b32 a14, v24
-; GCN-NEXT: v_accvgpr_write_b32 a15, v25
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:9], a[0:15] cbsz:4 blgp:3
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[10:25], v[0:3], v[4:9], v[10:25] cbsz:4 blgp:3
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v10
+; GCN-NEXT: v_mov_b32_e32 v1, v11
+; GCN-NEXT: v_mov_b32_e32 v2, v12
+; GCN-NEXT: v_mov_b32_e32 v3, v13
+; GCN-NEXT: v_mov_b32_e32 v4, v14
+; GCN-NEXT: v_mov_b32_e32 v5, v15
+; GCN-NEXT: v_mov_b32_e32 v6, v16
+; GCN-NEXT: v_mov_b32_e32 v7, v17
+; GCN-NEXT: v_mov_b32_e32 v8, v18
+; GCN-NEXT: v_mov_b32_e32 v9, v19
+; GCN-NEXT: v_mov_b32_e32 v10, v20
+; GCN-NEXT: v_mov_b32_e32 v11, v21
+; GCN-NEXT: v_mov_b32_e32 v12, v22
+; GCN-NEXT: v_mov_b32_e32 v13, v23
+; GCN-NEXT: v_mov_b32_e32 v14, v24
+; GCN-NEXT: v_mov_b32_e32 v15, v25
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v6i32(<4 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3205,41 +2109,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4(<4 x
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23], v24, v25 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3252,41 +2139,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__cons
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__cbsz4__blgp4__constant_scale_0_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, v8
-; GCN-NEXT: v_accvgpr_write_b32 a1, v9
-; GCN-NEXT: v_accvgpr_write_b32 a2, v10
-; GCN-NEXT: v_accvgpr_write_b32 a3, v11
-; GCN-NEXT: v_accvgpr_write_b32 a4, v12
-; GCN-NEXT: v_accvgpr_write_b32 a5, v13
-; GCN-NEXT: v_accvgpr_write_b32 a6, v14
-; GCN-NEXT: v_accvgpr_write_b32 a7, v15
-; GCN-NEXT: v_accvgpr_write_b32 a8, v16
-; GCN-NEXT: v_accvgpr_write_b32 a9, v17
-; GCN-NEXT: v_accvgpr_write_b32 a10, v18
-; GCN-NEXT: v_accvgpr_write_b32 a11, v19
-; GCN-NEXT: v_accvgpr_write_b32 a12, v20
-; GCN-NEXT: v_accvgpr_write_b32 a13, v21
-; GCN-NEXT: v_accvgpr_write_b32 a14, v22
-; GCN-NEXT: v_accvgpr_write_b32 a15, v23
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:4 blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v8
+; GCN-NEXT: v_mov_b32_e32 v1, v9
+; GCN-NEXT: v_mov_b32_e32 v2, v10
+; GCN-NEXT: v_mov_b32_e32 v3, v11
+; GCN-NEXT: v_mov_b32_e32 v4, v12
+; GCN-NEXT: v_mov_b32_e32 v5, v13
+; GCN-NEXT: v_mov_b32_e32 v6, v14
+; GCN-NEXT: v_mov_b32_e32 v7, v15
+; GCN-NEXT: v_mov_b32_e32 v8, v16
+; GCN-NEXT: v_mov_b32_e32 v9, v17
+; GCN-NEXT: v_mov_b32_e32 v10, v18
+; GCN-NEXT: v_mov_b32_e32 v11, v19
+; GCN-NEXT: v_mov_b32_e32 v12, v20
+; GCN-NEXT: v_mov_b32_e32 v13, v21
+; GCN-NEXT: v_mov_b32_e32 v14, v22
+; GCN-NEXT: v_mov_b32_e32 v15, v23
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v4i32.v4i32(<4 x i32> %arg0, <4 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -3303,45 +2173,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
-; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: v_mov_b32_e32 v32, s0
+; GCN-NEXT: v_mov_b32_e32 v33, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3351,45 +2206,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__sgpr_scaleA__vgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GCN-NEXT: v_mov_b32_e32 v33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v31 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3399,45 +2239,30 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgpr_scaleA__sgpr_scaleB:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
-; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: scratch_load_dword v31, off, s32
+; GCN-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GCN-NEXT: v_mov_b32_e32 v33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v16 op_sel_hi:[0,0,0]
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3455,8 +2280,24 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s28
-; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_mov_b32_e32 v16, v15
+; SDAG-NEXT: v_mov_b32_e32 v17, v14
+; SDAG-NEXT: v_mov_b32_e32 v15, v13
+; SDAG-NEXT: v_mov_b32_e32 v14, v12
+; SDAG-NEXT: v_mov_b32_e32 v13, v11
+; SDAG-NEXT: v_mov_b32_e32 v12, v10
+; SDAG-NEXT: v_mov_b32_e32 v11, v9
+; SDAG-NEXT: v_mov_b32_e32 v10, v8
+; SDAG-NEXT: v_mov_b32_e32 v9, v7
+; SDAG-NEXT: v_mov_b32_e32 v8, v6
+; SDAG-NEXT: v_mov_b32_e32 v7, v5
+; SDAG-NEXT: v_mov_b32_e32 v6, v4
+; SDAG-NEXT: v_mov_b32_e32 v5, v3
+; SDAG-NEXT: v_mov_b32_e32 v4, v2
+; SDAG-NEXT: v_mov_b32_e32 v3, v1
+; SDAG-NEXT: v_mov_b32_e32 v2, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s28
+; SDAG-NEXT: v_mov_b32_e32 v1, s29
; SDAG-NEXT: v_mov_b32_e32 v18, s20
; SDAG-NEXT: v_mov_b32_e32 v19, s21
; SDAG-NEXT: v_mov_b32_e32 v20, s22
@@ -3465,42 +2306,8 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-NEXT: v_mov_b32_e32 v23, s25
; SDAG-NEXT: v_mov_b32_e32 v24, s26
; SDAG-NEXT: v_mov_b32_e32 v25, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[18:25], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
@@ -3510,52 +2317,50 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[32:39], v[16:31], v14, v15 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3573,43 +2378,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v8
+; SDAG-NEXT: v_mov_b32_e32 v1, v9
+; SDAG-NEXT: v_mov_b32_e32 v2, v10
+; SDAG-NEXT: v_mov_b32_e32 v3, v11
+; SDAG-NEXT: v_mov_b32_e32 v4, v12
+; SDAG-NEXT: v_mov_b32_e32 v5, v13
+; SDAG-NEXT: v_mov_b32_e32 v6, v14
+; SDAG-NEXT: v_mov_b32_e32 v7, v15
+; SDAG-NEXT: v_mov_b32_e32 v8, v16
+; SDAG-NEXT: v_mov_b32_e32 v9, v17
+; SDAG-NEXT: v_mov_b32_e32 v10, v18
+; SDAG-NEXT: v_mov_b32_e32 v11, v19
+; SDAG-NEXT: v_mov_b32_e32 v12, v20
+; SDAG-NEXT: v_mov_b32_e32 v13, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, v22
+; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgpr_vgpr:
@@ -3623,43 +2412,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__sgp
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v8, v24 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v25, v24 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mov_b32_e32 v3, v11
+; GISEL-NEXT: v_mov_b32_e32 v4, v12
+; GISEL-NEXT: v_mov_b32_e32 v5, v13
+; GISEL-NEXT: v_mov_b32_e32 v6, v14
+; GISEL-NEXT: v_mov_b32_e32 v7, v15
+; GISEL-NEXT: v_mov_b32_e32 v8, v16
+; GISEL-NEXT: v_mov_b32_e32 v9, v17
+; GISEL-NEXT: v_mov_b32_e32 v10, v18
+; GISEL-NEXT: v_mov_b32_e32 v11, v19
+; GISEL-NEXT: v_mov_b32_e32 v12, v20
+; GISEL-NEXT: v_mov_b32_e32 v13, v21
+; GISEL-NEXT: v_mov_b32_e32 v14, v22
+; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3677,43 +2450,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v8
+; SDAG-NEXT: v_mov_b32_e32 v1, v9
+; SDAG-NEXT: v_mov_b32_e32 v2, v10
+; SDAG-NEXT: v_mov_b32_e32 v3, v11
+; SDAG-NEXT: v_mov_b32_e32 v4, v12
+; SDAG-NEXT: v_mov_b32_e32 v5, v13
+; SDAG-NEXT: v_mov_b32_e32 v6, v14
+; SDAG-NEXT: v_mov_b32_e32 v7, v15
+; SDAG-NEXT: v_mov_b32_e32 v8, v16
+; SDAG-NEXT: v_mov_b32_e32 v9, v17
+; SDAG-NEXT: v_mov_b32_e32 v10, v18
+; SDAG-NEXT: v_mov_b32_e32 v11, v19
+; SDAG-NEXT: v_mov_b32_e32 v12, v20
+; SDAG-NEXT: v_mov_b32_e32 v13, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, v22
+; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgpr_sgpr:
@@ -3727,43 +2484,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[26:33], v[0:7], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mov_b32_e32 v3, v11
+; GISEL-NEXT: v_mov_b32_e32 v4, v12
+; GISEL-NEXT: v_mov_b32_e32 v5, v13
+; GISEL-NEXT: v_mov_b32_e32 v6, v14
+; GISEL-NEXT: v_mov_b32_e32 v7, v15
+; GISEL-NEXT: v_mov_b32_e32 v8, v16
+; GISEL-NEXT: v_mov_b32_e32 v9, v17
+; GISEL-NEXT: v_mov_b32_e32 v10, v18
+; GISEL-NEXT: v_mov_b32_e32 v11, v19
+; GISEL-NEXT: v_mov_b32_e32 v12, v20
+; GISEL-NEXT: v_mov_b32_e32 v13, v21
+; GISEL-NEXT: v_mov_b32_e32 v14, v22
+; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -3781,43 +2522,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v23
-; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v25, s20
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v8
+; SDAG-NEXT: v_mov_b32_e32 v1, v9
+; SDAG-NEXT: v_mov_b32_e32 v2, v10
+; SDAG-NEXT: v_mov_b32_e32 v3, v11
+; SDAG-NEXT: v_mov_b32_e32 v4, v12
+; SDAG-NEXT: v_mov_b32_e32 v5, v13
+; SDAG-NEXT: v_mov_b32_e32 v6, v14
+; SDAG-NEXT: v_mov_b32_e32 v7, v15
+; SDAG-NEXT: v_mov_b32_e32 v8, v16
+; SDAG-NEXT: v_mov_b32_e32 v9, v17
+; SDAG-NEXT: v_mov_b32_e32 v10, v18
+; SDAG-NEXT: v_mov_b32_e32 v11, v19
+; SDAG-NEXT: v_mov_b32_e32 v12, v20
+; SDAG-NEXT: v_mov_b32_e32 v13, v21
+; SDAG-NEXT: v_mov_b32_e32 v14, v22
+; SDAG-NEXT: v_mov_b32_e32 v15, v23
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
@@ -3831,90 +2556,108 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v23
-; GISEL-NEXT: v_mov_b32_e32 v8, s20
+; GISEL-NEXT: v_mov_b32_e32 v25, s20
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[26:33], a[0:15], v24, v8 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[8:23], v[0:7], v[26:33], v[8:23], v24, v25 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mov_b32_e32 v3, v11
+; GISEL-NEXT: v_mov_b32_e32 v4, v12
+; GISEL-NEXT: v_mov_b32_e32 v5, v13
+; GISEL-NEXT: v_mov_b32_e32 v6, v14
+; GISEL-NEXT: v_mov_b32_e32 v7, v15
+; GISEL-NEXT: v_mov_b32_e32 v8, v16
+; GISEL-NEXT: v_mov_b32_e32 v9, v17
+; GISEL-NEXT: v_mov_b32_e32 v10, v18
+; GISEL-NEXT: v_mov_b32_e32 v11, v19
+; GISEL-NEXT: v_mov_b32_e32 v12, v20
+; GISEL-NEXT: v_mov_b32_e32 v13, v21
+; GISEL-NEXT: v_mov_b32_e32 v14, v22
+; GISEL-NEXT: v_mov_b32_e32 v15, v23
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> inreg %arg2, i32 %scale0, i32 inreg %scale1) {
-; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
-; GCN-NEXT: v_accvgpr_write_b32 a4, s16
-; GCN-NEXT: v_accvgpr_write_b32 a5, s17
-; GCN-NEXT: v_accvgpr_write_b32 a6, s18
-; GCN-NEXT: v_accvgpr_write_b32 a7, s19
-; GCN-NEXT: v_accvgpr_write_b32 a8, s20
-; GCN-NEXT: v_accvgpr_write_b32 a9, s21
-; GCN-NEXT: v_accvgpr_write_b32 a10, s22
-; GCN-NEXT: v_accvgpr_write_b32 a11, s23
-; GCN-NEXT: v_accvgpr_write_b32 a12, s24
-; GCN-NEXT: v_accvgpr_write_b32 a13, s25
-; GCN-NEXT: v_accvgpr_write_b32 a14, s26
-; GCN-NEXT: v_accvgpr_write_b32 a15, s27
-; GCN-NEXT: v_mov_b32_e32 v17, s28
-; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v16, v17 op_sel_hi:[0,0,0]
-; GCN-NEXT: s_nop 15
-; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v33, v7
+; SDAG-NEXT: v_mov_b32_e32 v32, v6
+; SDAG-NEXT: v_mov_b32_e32 v31, v5
+; SDAG-NEXT: v_mov_b32_e32 v30, v4
+; SDAG-NEXT: v_mov_b32_e32 v29, v3
+; SDAG-NEXT: v_mov_b32_e32 v28, v2
+; SDAG-NEXT: v_mov_b32_e32 v27, v1
+; SDAG-NEXT: v_mov_b32_e32 v26, v0
+; SDAG-NEXT: v_mov_b32_e32 v25, v15
+; SDAG-NEXT: v_mov_b32_e32 v24, v14
+; SDAG-NEXT: v_mov_b32_e32 v23, v13
+; SDAG-NEXT: v_mov_b32_e32 v22, v12
+; SDAG-NEXT: v_mov_b32_e32 v21, v11
+; SDAG-NEXT: v_mov_b32_e32 v20, v10
+; SDAG-NEXT: v_mov_b32_e32 v19, v9
+; SDAG-NEXT: v_mov_b32_e32 v18, v8
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: v_mov_b32_e32 v2, s2
+; SDAG-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-NEXT: v_mov_b32_e32 v4, s16
+; SDAG-NEXT: v_mov_b32_e32 v5, s17
+; SDAG-NEXT: v_mov_b32_e32 v6, s18
+; SDAG-NEXT: v_mov_b32_e32 v7, s19
+; SDAG-NEXT: v_mov_b32_e32 v8, s20
+; SDAG-NEXT: v_mov_b32_e32 v9, s21
+; SDAG-NEXT: v_mov_b32_e32 v10, s22
+; SDAG-NEXT: v_mov_b32_e32 v11, s23
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v13, s25
+; SDAG-NEXT: v_mov_b32_e32 v14, s26
+; SDAG-NEXT: v_mov_b32_e32 v15, s27
+; SDAG-NEXT: v_mov_b32_e32 v17, s28
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_vgpr_sgpr__vgpr_sgpr:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s1
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v32, v14
+; GISEL-NEXT: v_mov_b32_e32 v33, v15
+; GISEL-NEXT: s_mov_b32 s14, s2
+; GISEL-NEXT: s_mov_b32 s15, s3
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[26:27]
+; GISEL-NEXT: v_mov_b32_e32 v17, s28
+; GISEL-NEXT: s_nop 1
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
}
@@ -3931,52 +2674,34 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_mov_b32_e32 v24, s28
-; SDAG-NEXT: v_mov_b32_e32 v25, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_mov_b32_e32 v16, v15
+; SDAG-NEXT: v_mov_b32_e32 v17, v14
+; SDAG-NEXT: v_mov_b32_e32 v15, v13
+; SDAG-NEXT: v_mov_b32_e32 v14, v12
+; SDAG-NEXT: v_mov_b32_e32 v13, v11
+; SDAG-NEXT: v_mov_b32_e32 v12, v10
+; SDAG-NEXT: v_mov_b32_e32 v11, v9
+; SDAG-NEXT: v_mov_b32_e32 v10, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v8, s28
+; SDAG-NEXT: v_mov_b32_e32 v9, s29
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[26:33], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
-; SDAG-NEXT: s_nop 15
-; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
@@ -3986,56 +2711,46 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v24, s20
-; GISEL-NEXT: v_mov_b32_e32 v25, s21
-; GISEL-NEXT: v_mov_b32_e32 v26, s22
-; GISEL-NEXT: v_mov_b32_e32 v27, s23
-; GISEL-NEXT: v_mov_b32_e32 v28, s24
-; GISEL-NEXT: v_mov_b32_e32 v29, s25
-; GISEL-NEXT: v_mov_b32_e32 v30, s26
-; GISEL-NEXT: v_mov_b32_e32 v31, s27
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v30
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v31
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v16, s20
+; GISEL-NEXT: v_mov_b32_e32 v17, s21
+; GISEL-NEXT: v_mov_b32_e32 v18, s22
+; GISEL-NEXT: v_mov_b32_e32 v19, s23
+; GISEL-NEXT: v_mov_b32_e32 v20, s24
+; GISEL-NEXT: v_mov_b32_e32 v21, s25
+; GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GISEL-NEXT: v_mov_b32_e32 v23, s27
+; GISEL-NEXT: v_mov_b32_e32 v24, s28
+; GISEL-NEXT: v_mov_b32_e32 v25, s29
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[32:39], v[0:7], v[16:31], v14, v15 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
ret <16 x float> %result
@@ -4045,89 +2760,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 33
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 33
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_inlineimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 33
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 33
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 33, i32 2, i32 -2)
ret <16 x float> %result
@@ -4137,89 +2822,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 -2)
ret <16 x float> %result
@@ -4229,89 +2884,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, 1.0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 1065353216)
ret <16 x float> %result
@@ -4321,89 +2946,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, -2
-; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, -2
+; SDAG-NEXT: v_mov_b32_e32 v33, 1.0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_inlineimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 1.0
-; GISEL-NEXT: v_mov_b32_e32 v32, -2
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v33, -2
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1065353216, i32 2, i32 -2)
ret <16 x float> %result
@@ -4413,89 +3008,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal_
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1.0
-; SDAG-NEXT: v_mov_b32_e32 v32, 0.15915494
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v33, 0.15915494
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_FP_literal__scaleB_FP_literal:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0.15915494
-; GISEL-NEXT: v_mov_b32_e32 v32, 1.0
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 0.15915494
+; GISEL-NEXT: v_mov_b32_e32 v33, 1.0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 1042479491, i32 2, i32 1065353216)
ret <16 x float> %result
@@ -4505,89 +3070,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scale
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 0x4d
-; SDAG-NEXT: v_mov_b32_e32 v32, 0x41
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, 0x4d
+; SDAG-NEXT: v_mov_b32_e32 v33, 0x41
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[1,1,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[1,1,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__scaleA_kimm__scaleB_kimm:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0x41
-; GISEL-NEXT: v_mov_b32_e32 v32, 0x4d
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 0x41
+; GISEL-NEXT: v_mov_b32_e32 v33, 0x4d
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[1,1,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[1,1,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 2, i32 65, i32 2, i32 77)
ret <16 x float> %result
@@ -4764,80 +3299,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s16
-; SDAG-NEXT: v_mov_b32_e32 v7, s17
-; SDAG-NEXT: v_mov_b32_e32 v8, s18
-; SDAG-NEXT: v_mov_b32_e32 v9, s19
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s16
+; SDAG-NEXT: v_mov_b32_e32 v23, s17
+; SDAG-NEXT: v_mov_b32_e32 v24, s18
+; SDAG-NEXT: v_mov_b32_e32 v25, s19
+; SDAG-NEXT: v_mov_b32_e32 v26, s20
+; SDAG-NEXT: v_mov_b32_e32 v27, s21
+; SDAG-NEXT: v_mov_b32_e32 v28, s22
+; SDAG-NEXT: v_mov_b32_e32 v29, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT: v_mov_b32_e32 v14, s24
-; SDAG-NEXT: v_mov_b32_e32 v15, s25
-; SDAG-NEXT: v_mov_b32_e32 v16, s26
+; SDAG-NEXT: v_mov_b32_e32 v30, s24
+; SDAG-NEXT: v_mov_b32_e32 v31, s25
+; SDAG-NEXT: v_mov_b32_e32 v32, s26
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b32_e32 v17, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v33, s27
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; SDAG-NEXT: v_mov_b32_e32 v16, s0
+; SDAG-NEXT: v_mov_b32_e32 v17, s1
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v0, v1 op_sel_hi:[0,0,0]
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
+; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v24, s14
+; SDAG-NEXT: v_mov_b32_e32 v25, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s12
+; SDAG-NEXT: v_mov_b32_e32 v23, s13
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4846,63 +3373,55 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x80
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b32_e32 v20, s0
-; GISEL-NEXT: v_mov_b32_e32 v21, s1
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b32_e32 v36, s0
+; GISEL-NEXT: v_mov_b32_e32 v37, s1
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 %scale0, i32 0, i32 %scale1)
@@ -4915,80 +3434,72 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT: v_mov_b32_e32 v0, 42
-; SDAG-NEXT: v_mov_b32_e32 v1, 25
+; SDAG-NEXT: v_mov_b32_e32 v16, 42
+; SDAG-NEXT: v_mov_b32_e32 v17, 25
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s16
-; SDAG-NEXT: v_mov_b32_e32 v7, s17
-; SDAG-NEXT: v_mov_b32_e32 v8, s18
-; SDAG-NEXT: v_mov_b32_e32 v9, s19
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-NEXT: v_mov_b32_e32 v18, s12
+; SDAG-NEXT: v_mov_b32_e32 v19, s13
+; SDAG-NEXT: v_mov_b32_e32 v20, s14
+; SDAG-NEXT: v_mov_b32_e32 v21, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s16
+; SDAG-NEXT: v_mov_b32_e32 v23, s17
+; SDAG-NEXT: v_mov_b32_e32 v24, s18
+; SDAG-NEXT: v_mov_b32_e32 v25, s19
+; SDAG-NEXT: v_mov_b32_e32 v26, s20
+; SDAG-NEXT: v_mov_b32_e32 v27, s21
+; SDAG-NEXT: v_mov_b32_e32 v28, s22
+; SDAG-NEXT: v_mov_b32_e32 v29, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT: v_mov_b32_e32 v14, s24
-; SDAG-NEXT: v_mov_b32_e32 v15, s25
-; SDAG-NEXT: v_mov_b32_e32 v16, s26
-; SDAG-NEXT: v_mov_b32_e32 v17, s27
+; SDAG-NEXT: v_mov_b32_e32 v30, s24
+; SDAG-NEXT: v_mov_b32_e32 v31, s25
+; SDAG-NEXT: v_mov_b32_e32 v32, s26
+; SDAG-NEXT: v_mov_b32_e32 v33, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[2:9], v[10:17], a[0:15], v1, v0 op_sel_hi:[0,0,0] blgp:2
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], 48
-; SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off sc0 sc1
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mov_b32_e32 v18, s20
+; SDAG-NEXT: v_mov_b32_e32 v19, s21
+; SDAG-NEXT: v_mov_b32_e32 v20, s22
+; SDAG-NEXT: v_mov_b32_e32 v21, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], 48
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[18:21], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v6, s18
-; SDAG-NEXT: v_mov_b32_e32 v7, s19
-; SDAG-NEXT: v_mov_b32_e32 v4, s16
-; SDAG-NEXT: v_mov_b32_e32 v5, s17
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], 32
-; SDAG-NEXT: global_store_dwordx4 v[2:3], v[4:7], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v22, s18
+; SDAG-NEXT: v_mov_b32_e32 v23, s19
+; SDAG-NEXT: v_mov_b32_e32 v20, s16
+; SDAG-NEXT: v_mov_b32_e32 v21, s17
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], 32
+; SDAG-NEXT: global_store_dwordx4 v[18:19], v[20:23], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], 16
-; SDAG-NEXT: global_store_dwordx4 v[4:5], v[6:9], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v24, s14
+; SDAG-NEXT: v_mov_b32_e32 v25, s15
+; SDAG-NEXT: v_mov_b32_e32 v22, s12
+; SDAG-NEXT: v_mov_b32_e32 v23, s13
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], 16
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[22:25], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v10, s10
-; SDAG-NEXT: v_mov_b32_e32 v11, s11
-; SDAG-NEXT: v_mov_b32_e32 v8, s8
-; SDAG-NEXT: v_mov_b32_e32 v9, s9
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], 0
-; SDAG-NEXT: global_store_dwordx4 v[6:7], v[8:11], off sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], 0
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[24:27], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[2:3], a[8:11], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[18:19], v[8:11], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[0:1], a[12:15], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[16:17], v[12:15], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[6:7], a[0:3], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[22:23], v[0:3], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v[4:5], a[4:7], off sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v[20:21], v[4:7], off sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4996,62 +3507,54 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
; GISEL: ; %bb.0:
; GISEL-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GISEL-NEXT: v_mov_b32_e32 v20, 25
-; GISEL-NEXT: v_mov_b32_e32 v21, 42
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], 0
+; GISEL-NEXT: v_mov_b32_e32 v36, 25
+; GISEL-NEXT: v_mov_b32_e32 v37, 42
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], 16
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], 48
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v20, v21 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[36:37]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[38:39]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[40:41]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[42:43]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[44:45]
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[46:47]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[48:49]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[50:51]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], 32
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], 16
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], 48
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v36, v37 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], 32
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[16:19], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], v[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[20:23], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], v[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[24:27], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], v[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[28:31], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v[16:17], a[0:3], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[32:33], v[0:3], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[18:19], a[4:7], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[34:35], v[4:7], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[20:21], a[8:11], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[36:37], v[8:11], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v[22:23], a[12:15], off sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v[38:39], v[12:15], off sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 2, i32 0, i32 25, i32 0, i32 42)
@@ -5322,43 +3825,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <16 x float> %result
@@ -5368,43 +3855,27 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31]
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 3, i32 0, i32 1, i32 0)
ret <16 x float> %result
@@ -5414,89 +3885,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1(<8
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 1
-; SDAG-NEXT: v_mov_b32_e32 v32, 0
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, 1
+; SDAG-NEXT: v_mov_b32_e32 v33, 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_1:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 0
-; GISEL-NEXT: v_mov_b32_e32 v32, 1
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 0
+; GISEL-NEXT: v_mov_b32_e32 v33, 1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1)
ret <16 x float> %result
@@ -5506,89 +3947,59 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: v_mov_b32_e32 v31, 0
-; SDAG-NEXT: v_mov_b32_e32 v32, 1
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: v_mov_b32_e32 v32, 0
+; SDAG-NEXT: v_mov_b32_e32 v33, 1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_1_0_a:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: v_mov_b32_e32 v31, 1
-; GISEL-NEXT: v_mov_b32_e32 v32, 0
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: v_mov_b32_e32 v32, 1
+; GISEL-NEXT: v_mov_b32_e32 v33, 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0)
ret <16 x float> %result
@@ -5602,89 +4013,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -5697,89 +4076,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -5792,87 +4139,55 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:2 blgp:2
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:2 blgp:2
; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -5885,42 +4200,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6_
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp6__v8i32_fp6__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:2 blgp:2
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:2 blgp:2
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 2, ; cbsz
@@ -5933,89 +4232,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] blgp:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] blgp:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -6028,89 +4295,57 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4
; SDAG-NEXT: s_nop 15
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp8:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4
; GISEL-NEXT: s_nop 15
; GISEL-NEXT: s_nop 3
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -6124,43 +4359,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp8__v6i32_fp4(
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] blgp:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:7], v[8:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] blgp:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v6i32(<8 x i32> %arg0, <6 x i32> %arg1, <16 x float> %arg2,
i32 0, ; cbsz
@@ -6174,43 +4392,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v6i32_fp4__v8i32_fp8(
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: scratch_load_dword v31, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v14
-; GCN-NEXT: v_accvgpr_write_b32 a1, v15
-; GCN-NEXT: v_accvgpr_write_b32 a2, v16
-; GCN-NEXT: v_accvgpr_write_b32 a3, v17
-; GCN-NEXT: v_accvgpr_write_b32 a4, v18
-; GCN-NEXT: v_accvgpr_write_b32 a5, v19
-; GCN-NEXT: v_accvgpr_write_b32 a6, v20
-; GCN-NEXT: v_accvgpr_write_b32 a7, v21
-; GCN-NEXT: v_accvgpr_write_b32 a8, v22
-; GCN-NEXT: v_accvgpr_write_b32 a9, v23
-; GCN-NEXT: v_accvgpr_write_b32 a10, v24
-; GCN-NEXT: v_accvgpr_write_b32 a11, v25
-; GCN-NEXT: v_accvgpr_write_b32 a12, v26
-; GCN-NEXT: v_accvgpr_write_b32 a13, v27
-; GCN-NEXT: v_accvgpr_write_b32 a14, v28
-; GCN-NEXT: v_accvgpr_write_b32 a15, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:5], v[6:13], a[0:15], v30, v31 op_sel_hi:[0,0,0] cbsz:4
+; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[14:29], v[0:5], v[6:13], v[14:29], v30, v31 op_sel_hi:[0,0,0] cbsz:4
; GCN-NEXT: s_nop 15
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v14
+; GCN-NEXT: v_mov_b32_e32 v1, v15
+; GCN-NEXT: v_mov_b32_e32 v2, v16
+; GCN-NEXT: v_mov_b32_e32 v3, v17
+; GCN-NEXT: v_mov_b32_e32 v4, v18
+; GCN-NEXT: v_mov_b32_e32 v5, v19
+; GCN-NEXT: v_mov_b32_e32 v6, v20
+; GCN-NEXT: v_mov_b32_e32 v7, v21
+; GCN-NEXT: v_mov_b32_e32 v8, v22
+; GCN-NEXT: v_mov_b32_e32 v9, v23
+; GCN-NEXT: v_mov_b32_e32 v10, v24
+; GCN-NEXT: v_mov_b32_e32 v11, v25
+; GCN-NEXT: v_mov_b32_e32 v12, v26
+; GCN-NEXT: v_mov_b32_e32 v13, v27
+; GCN-NEXT: v_mov_b32_e32 v14, v28
+; GCN-NEXT: v_mov_b32_e32 v15, v29
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v6i32.v8i32(<6 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -6223,87 +4424,55 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4(
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: scratch_load_dword a15, off, s32
-; SDAG-NEXT: scratch_load_dword v31, off, s32 offset:8
-; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:4
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: scratch_load_dword v31, off, s32
+; SDAG-NEXT: scratch_load_dword v32, off, s32 offset:8
+; SDAG-NEXT: scratch_load_dword v33, off, s32 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v32, v31 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v33, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; SDAG-NEXT: s_nop 11
-; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
-; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
-; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
-; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
-; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
-; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
-; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
-; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
-; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
-; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
-; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
-; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
-; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
-; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
-; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
+; SDAG-NEXT: v_mov_b32_e32 v0, v16
+; SDAG-NEXT: v_mov_b32_e32 v1, v17
+; SDAG-NEXT: v_mov_b32_e32 v2, v18
+; SDAG-NEXT: v_mov_b32_e32 v3, v19
+; SDAG-NEXT: v_mov_b32_e32 v4, v20
+; SDAG-NEXT: v_mov_b32_e32 v5, v21
+; SDAG-NEXT: v_mov_b32_e32 v6, v22
+; SDAG-NEXT: v_mov_b32_e32 v7, v23
+; SDAG-NEXT: v_mov_b32_e32 v8, v24
+; SDAG-NEXT: v_mov_b32_e32 v9, v25
+; SDAG-NEXT: v_mov_b32_e32 v10, v26
+; SDAG-NEXT: v_mov_b32_e32 v11, v27
+; SDAG-NEXT: v_mov_b32_e32 v12, v28
+; SDAG-NEXT: v_mov_b32_e32 v13, v29
+; SDAG-NEXT: v_mov_b32_e32 v14, v30
+; SDAG-NEXT: v_mov_b32_e32 v15, v31
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: scratch_load_dword a15, off, s32
-; GISEL-NEXT: scratch_load_dword v31, off, s32 offset:4
-; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:8
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: scratch_load_dword v31, off, s32
+; GISEL-NEXT: scratch_load_dword v32, off, s32 offset:4
+; GISEL-NEXT: scratch_load_dword v33, off, s32 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], v31, v32 op_sel_hi:[0,0,0] cbsz:4 blgp:4
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31], v32, v33 op_sel_hi:[0,0,0] cbsz:4 blgp:4
; GISEL-NEXT: s_nop 11
-; GISEL-NEXT: v_accvgpr_read_b32 v0, a0
-; GISEL-NEXT: v_accvgpr_read_b32 v1, a1
-; GISEL-NEXT: v_accvgpr_read_b32 v2, a2
-; GISEL-NEXT: v_accvgpr_read_b32 v3, a3
-; GISEL-NEXT: v_accvgpr_read_b32 v4, a4
-; GISEL-NEXT: v_accvgpr_read_b32 v5, a5
-; GISEL-NEXT: v_accvgpr_read_b32 v6, a6
-; GISEL-NEXT: v_accvgpr_read_b32 v7, a7
-; GISEL-NEXT: v_accvgpr_read_b32 v8, a8
-; GISEL-NEXT: v_accvgpr_read_b32 v9, a9
-; GISEL-NEXT: v_accvgpr_read_b32 v10, a10
-; GISEL-NEXT: v_accvgpr_read_b32 v11, a11
-; GISEL-NEXT: v_accvgpr_read_b32 v12, a12
-; GISEL-NEXT: v_accvgpr_read_b32 v13, a13
-; GISEL-NEXT: v_accvgpr_read_b32 v14, a14
-; GISEL-NEXT: v_accvgpr_read_b32 v15, a15
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: v_mov_b32_e32 v2, v18
+; GISEL-NEXT: v_mov_b32_e32 v3, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v20
+; GISEL-NEXT: v_mov_b32_e32 v5, v21
+; GISEL-NEXT: v_mov_b32_e32 v6, v22
+; GISEL-NEXT: v_mov_b32_e32 v7, v23
+; GISEL-NEXT: v_mov_b32_e32 v8, v24
+; GISEL-NEXT: v_mov_b32_e32 v9, v25
+; GISEL-NEXT: v_mov_b32_e32 v10, v26
+; GISEL-NEXT: v_mov_b32_e32 v11, v27
+; GISEL-NEXT: v_mov_b32_e32 v12, v28
+; GISEL-NEXT: v_mov_b32_e32 v13, v29
+; GISEL-NEXT: v_mov_b32_e32 v14, v30
+; GISEL-NEXT: v_mov_b32_e32 v15, v31
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
@@ -6316,42 +4485,26 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4_
; GCN-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4___v8i32_fp4__v8i32_fp4__0_scale:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: scratch_load_dword a15, off, s32
-; GCN-NEXT: v_accvgpr_write_b32 a0, v16
-; GCN-NEXT: v_accvgpr_write_b32 a1, v17
-; GCN-NEXT: v_accvgpr_write_b32 a2, v18
-; GCN-NEXT: v_accvgpr_write_b32 a3, v19
-; GCN-NEXT: v_accvgpr_write_b32 a4, v20
-; GCN-NEXT: v_accvgpr_write_b32 a5, v21
-; GCN-NEXT: v_accvgpr_write_b32 a6, v22
-; GCN-NEXT: v_accvgpr_write_b32 a7, v23
-; GCN-NEXT: v_accvgpr_write_b32 a8, v24
-; GCN-NEXT: v_accvgpr_write_b32 a9, v25
-; GCN-NEXT: v_accvgpr_write_b32 a10, v26
-; GCN-NEXT: v_accvgpr_write_b32 a11, v27
-; GCN-NEXT: v_accvgpr_write_b32 a12, v28
-; GCN-NEXT: v_accvgpr_write_b32 a13, v29
-; GCN-NEXT: v_accvgpr_write_b32 a14, v30
+; GCN-NEXT: scratch_load_dword v31, off, s32
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15] cbsz:4 blgp:4
+; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 v[16:31], v[0:7], v[8:15], v[16:31] cbsz:4 blgp:4
; GCN-NEXT: s_nop 11
-; GCN-NEXT: v_accvgpr_read_b32 v0, a0
-; GCN-NEXT: v_accvgpr_read_b32 v1, a1
-; GCN-NEXT: v_accvgpr_read_b32 v2, a2
-; GCN-NEXT: v_accvgpr_read_b32 v3, a3
-; GCN-NEXT: v_accvgpr_read_b32 v4, a4
-; GCN-NEXT: v_accvgpr_read_b32 v5, a5
-; GCN-NEXT: v_accvgpr_read_b32 v6, a6
-; GCN-NEXT: v_accvgpr_read_b32 v7, a7
-; GCN-NEXT: v_accvgpr_read_b32 v8, a8
-; GCN-NEXT: v_accvgpr_read_b32 v9, a9
-; GCN-NEXT: v_accvgpr_read_b32 v10, a10
-; GCN-NEXT: v_accvgpr_read_b32 v11, a11
-; GCN-NEXT: v_accvgpr_read_b32 v12, a12
-; GCN-NEXT: v_accvgpr_read_b32 v13, a13
-; GCN-NEXT: v_accvgpr_read_b32 v14, a14
-; GCN-NEXT: v_accvgpr_read_b32 v15, a15
+; GCN-NEXT: v_mov_b32_e32 v0, v16
+; GCN-NEXT: v_mov_b32_e32 v1, v17
+; GCN-NEXT: v_mov_b32_e32 v2, v18
+; GCN-NEXT: v_mov_b32_e32 v3, v19
+; GCN-NEXT: v_mov_b32_e32 v4, v20
+; GCN-NEXT: v_mov_b32_e32 v5, v21
+; GCN-NEXT: v_mov_b32_e32 v6, v22
+; GCN-NEXT: v_mov_b32_e32 v7, v23
+; GCN-NEXT: v_mov_b32_e32 v8, v24
+; GCN-NEXT: v_mov_b32_e32 v9, v25
+; GCN-NEXT: v_mov_b32_e32 v10, v26
+; GCN-NEXT: v_mov_b32_e32 v11, v27
+; GCN-NEXT: v_mov_b32_e32 v12, v28
+; GCN-NEXT: v_mov_b32_e32 v13, v29
+; GCN-NEXT: v_mov_b32_e32 v14, v30
+; GCN-NEXT: v_mov_b32_e32 v15, v31
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.v8i32.v8i32(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2,
i32 4, ; cbsz
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
index 5475fa2..63466f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.xf32.gfx942.ll
@@ -9,22 +9,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_f32_16x16x8xf32:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 4.0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, 0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[4:5], v[0:1], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[8:9], v[4:5], v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-SDAG-NEXT: s_nop 6
-; GFX942-SDAG-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_16x16x8xf32:
@@ -32,22 +30,20 @@ define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_mov_b32 s4, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s5, 2.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s5, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_16x16x8_xf32 v[0:3], v[4:5], v[6:7], v[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GFX942-GISEL-NEXT: s_nop 5
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
@@ -107,37 +103,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-SDAG-LABEL: test_mfma_f32_32x32x4xf32:
; GFX942-SDAG: ; %bb.0: ; %bb
; GFX942-SDAG-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, 2.0
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0x40400000
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, 1.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, 2.0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0x40400000
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, 4.0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-SDAG-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-SDAG-NEXT: s_nop 1
-; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[2:3], v[0:1], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[18:19], v[16:17], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, 0
; GFX942-SDAG-NEXT: s_nop 9
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-SDAG-NEXT: s_endpgm
;
; GFX942-GISEL-LABEL: test_mfma_f32_32x32x4xf32:
@@ -145,37 +133,29 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
; GFX942-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GFX942-GISEL-NEXT: s_mov_b32 s18, 1.0
; GFX942-GISEL-NEXT: s_mov_b32 s19, 2.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[18:19]
; GFX942-GISEL-NEXT: s_mov_b32 s18, 0x40400000
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GFX942-GISEL-NEXT: s_mov_b32 s19, 4.0
-; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[18:19]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GFX942-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-GISEL-NEXT: s_nop 1
-; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
-; GFX942-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT: v_mfma_f32_32x32x4_xf32 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v16, 0
; GFX942-GISEL-NEXT: s_nop 9
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GFX942-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-GISEL-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
index bc72687..9436b49 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.iterative.ll
@@ -9,126 +9,127 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 2.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v34, 1.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v33, 2.0
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_add_u32_e32 v4, s0, v0
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:112
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:96
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:80
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:64
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:16
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:32
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:48
+; GCN-MINREG-NEXT: v_add_u32_e32 v36, s0, v0
+; GCN-MINREG-NEXT: v_add_u32_e32 v37, s1, v0
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:112
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:96
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:80
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:64
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:16
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:32
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:48
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
-; GCN-MINREG-NEXT: v_add_u32_e32 v5, s1, v0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v0, s1
-; GCN-MINREG-NEXT: v_add_u32_e32 v3, 0x6000, v4
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
+; GCN-MINREG-NEXT: v_mov_b32_e32 v32, s1
+; GCN-MINREG-NEXT: v_add_u32_e32 v35, 0x6000, v36
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[28:31] offset:112
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[24:27] offset:96
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[20:23] offset:80
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[16:19] offset:64
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[12:15] offset:48
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[8:11] offset:32
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[4:7] offset:16
-; GCN-MINREG-NEXT: ds_write_b128 v5, a[0:3]
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:8304
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:8288
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:8272
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:8256
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:8240
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:8224
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:8192
+; GCN-MINREG-NEXT: s_nop 0
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[28:31] offset:112
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[24:27] offset:96
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[20:23] offset:80
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[16:19] offset:64
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[12:15] offset:48
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[8:11] offset:32
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[4:7] offset:16
+; GCN-MINREG-NEXT: ds_write_b128 v37, v[0:3]
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:8304
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:8288
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:8272
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:8256
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:8240
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:8224
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:8192
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:24688
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:24672
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:24656
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:24640
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:24624
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:24608
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:24592
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:24576
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:8288
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:8304
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:8256
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:8272
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:8224
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:8240
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:8192
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:24688
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:24672
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:24656
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:24640
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:24624
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:24608
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:24576
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:49264
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:49248
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:49232
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:49216
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:49200
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:49184
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:49168
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:49152
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:16480
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:16496
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:16448
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:16464
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:16416
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:16432
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:16384
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:16400
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:49264
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:49248
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:49232
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:49216
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:49200
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:49184
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:49168
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:49152
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:24672
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:24640
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:24608
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:24624
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:57456
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:57440
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:57392
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:24672
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:24688
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:24640
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:24656
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:24608
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:24624
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:24576
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:57456
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:57440
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:57424
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:57408
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:57344
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:57360
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:57376
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:57392
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v1, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v33, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; GCN-MINREG-NEXT: ds_write_b128 v0, a[4:7] offset:32784
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[24:27] offset:32864
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[28:31] offset:32880
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[16:19] offset:32832
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[20:23] offset:32848
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[8:11] offset:32800
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[12:15] offset:32816
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[0:3] offset:32768
+; GCN-MINREG-NEXT: ds_write_b128 v32, v[4:7] offset:32784
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_endpgm
;
@@ -136,128 +137,128 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-MAXOCC-NEXT: v_and_b32_e32 v1, 0x1ff80, v0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 1.0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, 2.0
+; GCN-MAXOCC-NEXT: v_and_b32_e32 v33, 0x1ff80, v0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v34, 1.0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v35, 2.0
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v1
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s0, v33
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:112
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:96
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:80
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:64
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:16
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:32
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:48
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v1, s1, v1
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v33, s1, v33
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:112
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:96
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:80
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:64
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:48
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3]
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:112
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:96
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:80
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:64
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:48
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:32
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:16
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3]
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:8304
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:8288
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:8272
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:8256
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:8240
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:8224
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:8208
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:8192
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, s1
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v33, s1
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:8288
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:8304
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:8256
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:8272
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:8224
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:8240
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:8192
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:8208
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:24688
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:24672
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:24656
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:24640
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:24624
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:24608
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:24592
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:24576
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:16480
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:16496
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:16448
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:16464
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:16416
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:16432
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:16384
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:16400
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:49264
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:49248
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:49232
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:49216
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:49200
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:49184
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:49168
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:49152
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, 0x6000, v32
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:24672
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:24688
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:24640
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:24656
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:24608
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:24624
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:24576
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:24592
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:57456
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:57440
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:57424
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:57408
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:57344
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:57360
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:57376
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:57392
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v34, v35, v[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[0:3] offset:32768
-; GCN-MAXOCC-NEXT: ds_write_b128 v1, a[4:7] offset:32784
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[24:27] offset:32864
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[28:31] offset:32880
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[16:19] offset:32832
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[20:23] offset:32848
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[8:11] offset:32800
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[12:15] offset:32816
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[0:3] offset:32768
+; GCN-MAXOCC-NEXT: ds_write_b128 v33, v[4:7] offset:32784
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: s_endpgm
;
@@ -265,120 +266,120 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr
; GCN-ILP: ; %bb.0: ; %entry
; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-ILP-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
-; GCN-ILP-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-ILP-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-ILP-NEXT: v_and_b32_e32 v32, 0x1ff80, v0
+; GCN-ILP-NEXT: v_mov_b32_e32 v33, 1.0
+; GCN-ILP-NEXT: v_mov_b32_e32 v34, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v0
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-ILP-NEXT: v_add_u32_e32 v35, s0, v32
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:48
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:32
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:16
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:64
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:80
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:96
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:112
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v0, s1, v0
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v32, s1, v32
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:112
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:96
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:80
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:64
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:48
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3]
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:112
+; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:96
+; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:80
+; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:64
+; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:48
+; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:32
+; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:16
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3]
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:8192
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:8208
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:8224
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:8240
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:8256
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:8272
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:8288
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:8304
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: v_mov_b32_e32 v0, s1
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-ILP-NEXT: v_mov_b32_e32 v32, s1
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:8288
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:8304
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:8256
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:8272
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:8224
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:8240
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:8192
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:8208
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
+; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:8288
+; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:8304
+; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:8256
+; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:8272
+; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:8224
+; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:8240
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:8192
+; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:8208
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:24576
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:24592
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:24608
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:24624
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:24640
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:24656
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:24672
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:24688
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:16400
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:16384
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:16432
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:16416
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:16464
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:16448
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:16496
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:16480
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248
+; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:16400
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:49168
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:16384
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:49152
+; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:16432
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:49200
+; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:16416
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:49184
+; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:16464
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:49232
+; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:16448
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:49216
+; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:16496
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:49264
+; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:16480
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:49248
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v35, 0x6000, v35
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:24624
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:24640
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:24688
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:24672
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440
+; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:24592
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:57360
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:24576
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:57344
+; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:24624
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:57392
+; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:24608
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:57376
+; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:24656
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:57424
+; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:24640
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:57408
+; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:24688
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:57456
+; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:24672
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:57440
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v0, a[24:27] offset:32864
-; GCN-ILP-NEXT: ds_write_b128 v0, a[28:31] offset:32880
-; GCN-ILP-NEXT: ds_write_b128 v0, a[16:19] offset:32832
-; GCN-ILP-NEXT: ds_write_b128 v0, a[20:23] offset:32848
-; GCN-ILP-NEXT: ds_write_b128 v0, a[8:11] offset:32800
-; GCN-ILP-NEXT: ds_write_b128 v0, a[12:15] offset:32816
-; GCN-ILP-NEXT: ds_write_b128 v0, a[0:3] offset:32768
-; GCN-ILP-NEXT: ds_write_b128 v0, a[4:7] offset:32784
+; GCN-ILP-NEXT: ds_write_b128 v32, v[24:27] offset:32864
+; GCN-ILP-NEXT: ds_write_b128 v32, v[28:31] offset:32880
+; GCN-ILP-NEXT: ds_write_b128 v32, v[16:19] offset:32832
+; GCN-ILP-NEXT: ds_write_b128 v32, v[20:23] offset:32848
+; GCN-ILP-NEXT: ds_write_b128 v32, v[8:11] offset:32800
+; GCN-ILP-NEXT: ds_write_b128 v32, v[12:15] offset:32816
+; GCN-ILP-NEXT: ds_write_b128 v32, v[0:3] offset:32768
+; GCN-ILP-NEXT: ds_write_b128 v32, v[4:7] offset:32784
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
@@ -455,129 +456,129 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MINREG: ; %bb.0: ; %entry
; GCN-MINREG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MINREG-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-MINREG-NEXT: v_and_b32_e32 v2, 0x1ff80, v0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-MINREG-NEXT: v_mov_b32_e32 v0, 2.0
+; GCN-MINREG-NEXT: v_and_b32_e32 v0, 0x1ff80, v0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v33, 1.0
+; GCN-MINREG-NEXT: v_mov_b32_e32 v32, 2.0
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_add_u32_e32 v3, s0, v2
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:112
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:96
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:80
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:64
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:16
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:32
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:48
+; GCN-MINREG-NEXT: v_add_u32_e32 v35, s0, v0
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:112
+; GCN-MINREG-NEXT: v_add_u32_e32 v34, s1, v0
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:96
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:80
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:64
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:16
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:32
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:48
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
-; GCN-MINREG-NEXT: v_add_u32_e32 v2, s1, v2
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
-; GCN-MINREG-NEXT: s_nop 1
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:112
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:96
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:80
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:64
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:48
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3]
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:8304
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:8192
+; GCN-MINREG-NEXT: s_nop 2
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:112
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:96
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:80
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:64
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:48
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:32
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:16
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3]
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:8304
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:8288
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:8272
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:8256
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:8240
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:8224
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:8208
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:8192
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
-; GCN-MINREG-NEXT: v_mov_b32_e32 v2, s1
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
+; GCN-MINREG-NEXT: v_mov_b32_e32 v34, s1
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:8288
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:8304
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:8256
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:8272
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:8224
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:8240
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:8192
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:8208
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:8288
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:8304
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:8256
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:8272
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:8224
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:8240
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:8192
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:8208
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_barrier mask(0x00000000)
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:24688
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:24672
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:24656
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:24640
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:24576
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:24592
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:24608
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:24624
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:24688
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:24672
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:24656
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:24640
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:24576
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:24592
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:24608
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:24624
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
-; GCN-MINREG-NEXT: v_add_u32_e32 v4, 0x6000, v3
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
+; GCN-MINREG-NEXT: v_add_u32_e32 v36, 0x6000, v35
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 1
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:16496
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:16480
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:16464
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:16448
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:16432
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:16416
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:16400
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:16384
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v3 offset:49264
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v3 offset:49248
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v3 offset:49152
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:16496
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:16480
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:16464
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:16448
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:16432
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:16416
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:16400
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:16384
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v35 offset:49264
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v35 offset:49248
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v35 offset:49232
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v35 offset:49216
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v35 offset:49200
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v35 offset:49184
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v35 offset:49168
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v35 offset:49152
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:24688
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:24672
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:24656
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:24640
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:24624
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:24608
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:24592
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:24576
-; GCN-MINREG-NEXT: ds_read_b128 a[28:31], v4 offset:57456
-; GCN-MINREG-NEXT: ds_read_b128 a[24:27], v4 offset:57440
-; GCN-MINREG-NEXT: ds_read_b128 a[20:23], v4 offset:57424
-; GCN-MINREG-NEXT: ds_read_b128 a[16:19], v4 offset:57408
-; GCN-MINREG-NEXT: ds_read_b128 a[0:3], v4 offset:57344
-; GCN-MINREG-NEXT: ds_read_b128 a[4:7], v4 offset:57360
-; GCN-MINREG-NEXT: ds_read_b128 a[8:11], v4 offset:57376
-; GCN-MINREG-NEXT: ds_read_b128 a[12:15], v4 offset:57392
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:24688
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:24672
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:24656
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:24640
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:24624
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:24608
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:24592
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:24576
+; GCN-MINREG-NEXT: ds_read_b128 v[28:31], v36 offset:57456
+; GCN-MINREG-NEXT: ds_read_b128 v[24:27], v36 offset:57440
+; GCN-MINREG-NEXT: ds_read_b128 v[20:23], v36 offset:57424
+; GCN-MINREG-NEXT: ds_read_b128 v[16:19], v36 offset:57408
+; GCN-MINREG-NEXT: ds_read_b128 v[0:3], v36 offset:57344
+; GCN-MINREG-NEXT: ds_read_b128 v[4:7], v36 offset:57360
+; GCN-MINREG-NEXT: ds_read_b128 v[8:11], v36 offset:57376
+; GCN-MINREG-NEXT: ds_read_b128 v[12:15], v36 offset:57392
; GCN-MINREG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GCN-MINREG-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MINREG-NEXT: s_nop 15
; GCN-MINREG-NEXT: s_nop 2
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[28:31] offset:32880
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[24:27] offset:32864
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[20:23] offset:32848
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[16:19] offset:32832
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[12:15] offset:32816
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[8:11] offset:32800
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[4:7] offset:32784
-; GCN-MINREG-NEXT: ds_write_b128 v2, a[0:3] offset:32768
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[28:31] offset:32880
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[24:27] offset:32864
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[20:23] offset:32848
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[16:19] offset:32832
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[12:15] offset:32816
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[8:11] offset:32800
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[4:7] offset:32784
+; GCN-MINREG-NEXT: ds_write_b128 v34, v[0:3] offset:32768
; GCN-MINREG-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MINREG-NEXT: s_endpgm
;
@@ -585,129 +586,129 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-MAXOCC: ; %bb.0: ; %entry
; GCN-MAXOCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-MAXOCC-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-MAXOCC-NEXT: v_and_b32_e32 v3, 0x1ff80, v0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v2, 2.0
+; GCN-MAXOCC-NEXT: v_and_b32_e32 v35, 0x1ff80, v0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v33, 1.0
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v34, 2.0
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, s0, v3
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:112
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:96
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:80
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:64
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:16
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:32
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:48
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, s0, v35
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:112
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:96
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:80
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:64
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:16
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:32
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:48
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v3, s1, v3
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v35, s1, v35
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:112
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:96
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:80
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:64
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:48
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3]
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:8304
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:8288
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:8272
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:8256
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:8240
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:8224
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:8208
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:8192
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:112
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:96
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:80
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:64
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:48
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:32
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:16
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3]
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:8304
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:8288
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:8272
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:8256
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:8240
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:8224
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:8208
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:8192
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_mov_b32_e32 v3, s1
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-MAXOCC-NEXT: v_mov_b32_e32 v35, s1
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:8288
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:8304
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:8256
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:8272
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:8224
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:8240
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:8192
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:8208
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:8288
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:8304
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:8256
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:8272
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:8224
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:8240
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:8192
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:8208
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_barrier mask(0x00000000)
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:24688
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:24672
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:24656
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:24640
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:24576
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:24592
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:24608
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:24624
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:24688
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:24672
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:24656
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:24640
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:24576
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:24592
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:24608
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:24624
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:16496
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:16480
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:16464
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:16448
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:16432
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:16416
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:16400
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:16384
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:49264
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:49248
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:49232
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:49216
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:49200
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:49184
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:49168
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:49152
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:16496
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:16480
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:16464
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:16448
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:16432
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:16416
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:16400
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:16384
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:49264
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:49248
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:49232
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:49216
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:49200
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:49184
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:49168
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:49152
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
-; GCN-MAXOCC-NEXT: v_add_u32_e32 v0, 0x6000, v0
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
+; GCN-MAXOCC-NEXT: v_add_u32_e32 v32, 0x6000, v32
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 1
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:24688
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:24672
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:24656
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:24640
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:24624
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:24608
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:24592
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:24576
-; GCN-MAXOCC-NEXT: ds_read_b128 a[28:31], v0 offset:57456
-; GCN-MAXOCC-NEXT: ds_read_b128 a[24:27], v0 offset:57440
-; GCN-MAXOCC-NEXT: ds_read_b128 a[20:23], v0 offset:57424
-; GCN-MAXOCC-NEXT: ds_read_b128 a[16:19], v0 offset:57408
-; GCN-MAXOCC-NEXT: ds_read_b128 a[0:3], v0 offset:57344
-; GCN-MAXOCC-NEXT: ds_read_b128 a[4:7], v0 offset:57360
-; GCN-MAXOCC-NEXT: ds_read_b128 a[8:11], v0 offset:57376
-; GCN-MAXOCC-NEXT: ds_read_b128 a[12:15], v0 offset:57392
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:24688
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:24672
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:24656
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:24640
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:24624
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:24608
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:24592
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:24576
+; GCN-MAXOCC-NEXT: ds_read_b128 v[28:31], v32 offset:57456
+; GCN-MAXOCC-NEXT: ds_read_b128 v[24:27], v32 offset:57440
+; GCN-MAXOCC-NEXT: ds_read_b128 v[20:23], v32 offset:57424
+; GCN-MAXOCC-NEXT: ds_read_b128 v[16:19], v32 offset:57408
+; GCN-MAXOCC-NEXT: ds_read_b128 v[0:3], v32 offset:57344
+; GCN-MAXOCC-NEXT: ds_read_b128 v[4:7], v32 offset:57360
+; GCN-MAXOCC-NEXT: ds_read_b128 v[8:11], v32 offset:57376
+; GCN-MAXOCC-NEXT: ds_read_b128 v[12:15], v32 offset:57392
; GCN-MAXOCC-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; GCN-MAXOCC-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v34, v[0:31]
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-MAXOCC-NEXT: s_nop 15
; GCN-MAXOCC-NEXT: s_nop 2
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[28:31] offset:32880
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[24:27] offset:32864
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[20:23] offset:32848
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[16:19] offset:32832
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[12:15] offset:32816
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[8:11] offset:32800
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[4:7] offset:32784
-; GCN-MAXOCC-NEXT: ds_write_b128 v3, a[0:3] offset:32768
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[28:31] offset:32880
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[24:27] offset:32864
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[20:23] offset:32848
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[16:19] offset:32832
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[12:15] offset:32816
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[8:11] offset:32800
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[4:7] offset:32784
+; GCN-MAXOCC-NEXT: ds_write_b128 v35, v[0:3] offset:32768
; GCN-MAXOCC-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-MAXOCC-NEXT: s_endpgm
;
@@ -715,127 +716,127 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave_spl
; GCN-ILP: ; %bb.0: ; %entry
; GCN-ILP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-ILP-NEXT: v_lshlrev_b32_e32 v0, 7, v0
-; GCN-ILP-NEXT: v_and_b32_e32 v2, 0x1ff80, v0
-; GCN-ILP-NEXT: v_mov_b32_e32 v0, 1.0
-; GCN-ILP-NEXT: v_mov_b32_e32 v1, 2.0
+; GCN-ILP-NEXT: v_and_b32_e32 v34, 0x1ff80, v0
+; GCN-ILP-NEXT: v_mov_b32_e32 v32, 1.0
+; GCN-ILP-NEXT: v_mov_b32_e32 v33, 2.0
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_add_u32_e32 v3, s0, v2
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:48
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:32
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:16
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:64
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:80
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:96
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:112
+; GCN-ILP-NEXT: v_add_u32_e32 v35, s0, v34
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:48
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:32
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:16
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:64
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:80
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:96
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:112
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v2, s1, v2
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v34, s1, v34
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3]
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:8192
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:8208
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:8224
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:48
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:8240
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:64
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:8256
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:80
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:8272
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:96
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:8288
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:112
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:8304
+; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3]
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:8192
+; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:16
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:8208
+; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:32
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:8224
+; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:48
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:8240
+; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:64
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:8256
+; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:80
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:8272
+; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:96
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:8288
+; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:112
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:8304
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT: v_mov_b32_e32 v2, s1
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GCN-ILP-NEXT: v_mov_b32_e32 v34, s1
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:8288
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:8304
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:8256
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:8272
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:8224
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:8240
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:8192
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:8208
+; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:8288
+; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:8304
+; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:8256
+; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:8272
+; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:8224
+; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:8240
+; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:8192
+; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:8208
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_barrier mask(0x00000000)
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:24624
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:24640
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:24672
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:24688
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:24624
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:24608
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:24592
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:24576
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:24640
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:24656
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:24672
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:24688
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:16496
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:16480
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:16464
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:16448
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:16432
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:16416
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:16400
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:16384
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:49152
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:49168
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:49184
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:49200
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:49216
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:49232
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:49248
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:49264
+; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:16496
+; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:16480
+; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:16464
+; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:16448
+; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:16432
+; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:16416
+; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:16400
+; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:16384
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:49152
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:49168
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:49184
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:49200
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:49216
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:49232
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:49248
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:49264
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GCN-ILP-NEXT: v_add_u32_e32 v3, 0x6000, v3
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
+; GCN-ILP-NEXT: v_add_u32_e32 v35, 0x6000, v35
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 1
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:24576
-; GCN-ILP-NEXT: ds_read_b128 a[0:3], v3 offset:57344
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:24592
-; GCN-ILP-NEXT: ds_read_b128 a[4:7], v3 offset:57360
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:24608
-; GCN-ILP-NEXT: ds_read_b128 a[8:11], v3 offset:57376
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:24624
-; GCN-ILP-NEXT: ds_read_b128 a[12:15], v3 offset:57392
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:24640
-; GCN-ILP-NEXT: ds_read_b128 a[16:19], v3 offset:57408
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:24656
-; GCN-ILP-NEXT: ds_read_b128 a[20:23], v3 offset:57424
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:24672
-; GCN-ILP-NEXT: ds_read_b128 a[24:27], v3 offset:57440
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:24688
-; GCN-ILP-NEXT: ds_read_b128 a[28:31], v3 offset:57456
+; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:24576
+; GCN-ILP-NEXT: ds_read_b128 v[0:3], v35 offset:57344
+; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:24592
+; GCN-ILP-NEXT: ds_read_b128 v[4:7], v35 offset:57360
+; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:24608
+; GCN-ILP-NEXT: ds_read_b128 v[8:11], v35 offset:57376
+; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:24624
+; GCN-ILP-NEXT: ds_read_b128 v[12:15], v35 offset:57392
+; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:24640
+; GCN-ILP-NEXT: ds_read_b128 v[16:19], v35 offset:57408
+; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:24656
+; GCN-ILP-NEXT: ds_read_b128 v[20:23], v35 offset:57424
+; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:24672
+; GCN-ILP-NEXT: ds_read_b128 v[24:27], v35 offset:57440
+; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:24688
+; GCN-ILP-NEXT: ds_read_b128 v[28:31], v35 offset:57456
; GCN-ILP-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GCN-ILP-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0)
; GCN-ILP-NEXT: s_nop 15
; GCN-ILP-NEXT: s_nop 2
-; GCN-ILP-NEXT: ds_write_b128 v2, a[28:31] offset:32880
-; GCN-ILP-NEXT: ds_write_b128 v2, a[24:27] offset:32864
-; GCN-ILP-NEXT: ds_write_b128 v2, a[20:23] offset:32848
-; GCN-ILP-NEXT: ds_write_b128 v2, a[16:19] offset:32832
-; GCN-ILP-NEXT: ds_write_b128 v2, a[12:15] offset:32816
-; GCN-ILP-NEXT: ds_write_b128 v2, a[8:11] offset:32800
-; GCN-ILP-NEXT: ds_write_b128 v2, a[4:7] offset:32784
-; GCN-ILP-NEXT: ds_write_b128 v2, a[0:3] offset:32768
+; GCN-ILP-NEXT: ds_write_b128 v34, v[28:31] offset:32880
+; GCN-ILP-NEXT: ds_write_b128 v34, v[24:27] offset:32864
+; GCN-ILP-NEXT: ds_write_b128 v34, v[20:23] offset:32848
+; GCN-ILP-NEXT: ds_write_b128 v34, v[16:19] offset:32832
+; GCN-ILP-NEXT: ds_write_b128 v34, v[12:15] offset:32816
+; GCN-ILP-NEXT: ds_write_b128 v34, v[8:11] offset:32800
+; GCN-ILP-NEXT: ds_write_b128 v34, v[4:7] offset:32784
+; GCN-ILP-NEXT: ds_write_b128 v34, v[0:3] offset:32768
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
; GCN-ILP-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index aa099b6..11d0099 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -misched-cluster=0 -amdgpu-mfma-vgpr-form=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
define amdgpu_kernel void @test_sched_group_barrier() #0 {
; GCN-LABEL: test_sched_group_barrier:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 6eb9449..abf741c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -1,6 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,SDAG-VGPR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN-VGPR,GISEL-VGPR %s
declare i32 @llvm.amdgcn.workitem.id.x()
@@ -62,6 +65,58 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[6:7]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s16
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[6:7]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -82,6 +137,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -97,6 +163,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -112,6 +189,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -163,6 +251,48 @@ define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -237,6 +367,70 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GISEL-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -304,6 +498,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -366,6 +618,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -428,6 +738,64 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -524,6 +892,82 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -560,6 +1004,32 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: s_nop 7
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
; GCN-NEXT: s_endpgm
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr:
+; GCN-VGPR: ; %bb.0: ; %bb
+; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GCN-VGPR-NEXT: s_nop 0
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; GCN-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -580,6 +1050,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -595,6 +1076,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -610,6 +1102,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -643,6 +1146,30 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; GCN-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; GCN-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; GCN-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; GCN-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; GCN-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; GCN-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; GCN-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; GCN-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; GCN-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; GCN-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; GCN-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GCN-VGPR-NEXT: s_nop 1
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[14:17], v[6:13], v4
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -685,6 +1212,38 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1)
; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
; GCN-NEXT: s_endpgm
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr:
+; GCN-VGPR: ; %bb.0: ; %bb
+; GCN-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GCN-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GCN-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; GCN-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; GCN-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; GCN-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; GCN-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GCN-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[2:3]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[0:1]
+; GCN-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; GCN-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; GCN-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GCN-VGPR-NEXT: s_nop 0
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GCN-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GCN-VGPR-NEXT: s_nop 10
+; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; GCN-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; GCN-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -717,6 +1276,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
+; GCN-VGPR-NEXT: s_nop 11
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -744,6 +1326,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 11
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -771,6 +1376,29 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 11
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GCN-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -827,6 +1455,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-NEXT: v_mov_b32_e32 v14, v26
; GCN-NEXT: v_mov_b32_e32 v15, v27
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; GCN-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; GCN-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GCN-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; GCN-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; GCN-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; GCN-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; GCN-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; GCN-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; GCN-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; GCN-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; GCN-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; GCN-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; GCN-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GCN-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GCN-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GCN-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; GCN-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; GCN-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; GCN-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; GCN-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; GCN-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; GCN-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; GCN-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; GCN-VGPR-NEXT: s_nop 1
+; GCN-VGPR-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[26:29], v[18:25], v16
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -895,6 +1559,64 @@ define amdgpu_kernel void @test_smfmac_i32_16x16x128_i8__vgpr(ptr addrspace(1) %
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i32>, ptr addrspace(1) %arg, i32 %id
@@ -915,6 +1637,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x i32> %result
}
@@ -930,6 +1663,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x i32> %result
}
@@ -945,6 +1689,17 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x i32> %result
}
@@ -996,6 +1751,48 @@ define <4 x i32> @test_smfmac_i32_16x16x128_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_16x16x128_i8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_i32_16x16x128_i8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x i32> %result
}
@@ -1076,6 +1873,76 @@ define amdgpu_kernel void @test_smfmac_i32_32x32x64_i8__vgpr(ptr addrspace(1) %a
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x i32>, ptr addrspace(1) %arg, i32 %id
@@ -1143,6 +2010,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x i32> %result
}
@@ -1205,6 +2130,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x i32> %result
}
@@ -1267,6 +2250,64 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x i32> %result
}
@@ -1363,6 +2404,82 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x i32> %result
}
@@ -1431,6 +2548,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -1451,6 +2626,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1466,6 +2652,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -1481,6 +2678,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1532,6 +2740,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1600,6 +2850,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -1620,6 +2928,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1635,6 +2954,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -1650,6 +2980,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1701,6 +3042,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1769,6 +3152,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr(ptr addrspace
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -1789,6 +3230,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1804,6 +3256,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -1819,6 +3282,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1870,6 +3344,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1938,6 +3454,64 @@ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace
; GISEL-NEXT: s_nop 6
; GISEL-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s15
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: s_nop 7
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[14:17], v0, s[0:1]
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, s2
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[14:17], v[8:11], v[0:7], v12 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-VGPR-NEXT: s_nop 6
+; GISEL-VGPR-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1]
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
@@ -1958,6 +3532,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -1973,6 +3558,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
@@ -1988,6 +3584,17 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <
; GCN-NEXT: v_mov_b32_e32 v2, v14
; GCN-NEXT: v_mov_b32_e32 v3, v15
; GCN-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; GCN-VGPR: ; %bb.0:
+; GCN-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-VGPR-NEXT: s_nop 7
+; GCN-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GCN-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GCN-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GCN-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GCN-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -2039,6 +3646,48 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, s23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[14:17], v[6:13], v4
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
@@ -2119,6 +3768,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -2186,6 +3905,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -2248,6 +4025,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -2310,6 +4145,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -2406,6 +4299,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -2486,6 +4455,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_bf8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -2553,6 +4592,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -2615,6 +4712,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -2677,6 +4832,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -2773,6 +4986,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -2853,6 +5142,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_bf8__vgpr(ptr addrspace(
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -2920,6 +5279,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -2982,6 +5399,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -3044,6 +5519,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -3140,6 +5673,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -3220,6 +5829,76 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x64_fp8_fp8__vgpr(ptr addrspace(
; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
+; SDAG-VGPR: ; %bb.0: ; %bb
+; SDAG-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; SDAG-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; SDAG-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16
+; SDAG-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7]
+; SDAG-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; SDAG-VGPR-NEXT: s_load_dword s16, s[4:5], 0x64
+; SDAG-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
+; SDAG-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s16
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0)
+; SDAG-VGPR-NEXT: s_nop 0
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-VGPR-NEXT: s_nop 10
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7]
+; SDAG-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16
+; SDAG-VGPR-NEXT: s_endpgm
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__vgpr:
+; GISEL-VGPR: ; %bb.0: ; %bb
+; GISEL-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GISEL-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GISEL-VGPR-NEXT: v_lshlrev_b32_e32 v16, 6, v0
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1]
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GISEL-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
+; GISEL-VGPR-NEXT: s_load_dword s2, s[4:5], 0x64
+; GISEL-VGPR-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0)
+; GISEL-VGPR-NEXT: s_nop 0
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, 0
+; GISEL-VGPR-NEXT: s_nop 10
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GISEL-VGPR-NEXT: s_endpgm
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id
@@ -3287,6 +5966,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
@@ -3349,6 +6086,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
}
@@ -3411,6 +6206,64 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-VGPR-NEXT: s_nop 11
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v48, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v49, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v50, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v51, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v30, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v31, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v32, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v33, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v34, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v35, v9
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v36, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v20
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v21
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v22
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v23
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
}
@@ -3507,6 +6360,82 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; SDAG-VGPR: ; %bb.0:
+; SDAG-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v26, s0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v27, s1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v28, s2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v29, s3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v16, v10
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v14, v8
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v13, v7
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v12, v6
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v11, v5
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v10, v4
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v9, v3
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v7, v1
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v6, v0
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v18, s16
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v19, s17
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v20, s18
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v21, s19
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v22, s20
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v23, s21
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v24, s22
+; SDAG-VGPR-NEXT: v_mov_b32_e32 v25, s23
+; SDAG-VGPR-NEXT: s_nop 1
+; SDAG-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[26:29], v[18:25], v16
+; SDAG-VGPR-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-VGPR-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
+; GISEL-VGPR: ; %bb.0:
+; GISEL-VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v0
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v1
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v2
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v3
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v4
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v16, v5
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v17, v6
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v18, v7
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v19, v8
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v20, v9
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v21, v10
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v0, s24
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v1, s25
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v2, s26
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v3, s27
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v4, s28
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v5, s29
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
+; GISEL-VGPR-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v6, v11
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v7, v12
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v8, v13
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v9, v14
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v10, v15
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v11, v16
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v12, v17
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v13, v18
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v14, v19
+; GISEL-VGPR-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-VGPR-NEXT: s_nop 1
+; GISEL-VGPR-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-VGPR-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 0af655df..b7efe1e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -101,120 +101,120 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -333,121 +333,119 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -559,120 +557,120 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v32
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v32
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -821,184 +819,120 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000
; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_unfoldable_seq:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000
+; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000
+; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000
+; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000
+; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000
+; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000
+; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000
+; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000
+; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000
+; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000
+; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000
+; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000
+; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000
+; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000
+; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000
+; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000
+; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000
+; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000
+; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000
+; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000
+; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000
+; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000
+; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000
+; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000
+; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000
+; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000
; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -1111,121 +1045,119 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-LABEL: test_mfma_loop_vgpr_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_vgpr_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB4_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1378,124 +1310,122 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s1
+; GFX90A-NEXT: v_mov_b32_e32 v6, s1
+; GFX90A-NEXT: v_mov_b32_e32 v7, s1
+; GFX90A-NEXT: v_mov_b32_e32 v8, s1
+; GFX90A-NEXT: v_mov_b32_e32 v9, s1
+; GFX90A-NEXT: v_mov_b32_e32 v10, s1
+; GFX90A-NEXT: v_mov_b32_e32 v11, s1
+; GFX90A-NEXT: v_mov_b32_e32 v12, s1
+; GFX90A-NEXT: v_mov_b32_e32 v13, s1
+; GFX90A-NEXT: v_mov_b32_e32 v14, s1
+; GFX90A-NEXT: v_mov_b32_e32 v15, s1
+; GFX90A-NEXT: v_mov_b32_e32 v16, s1
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s1
+; GFX90A-NEXT: v_mov_b32_e32 v19, s1
+; GFX90A-NEXT: v_mov_b32_e32 v20, s1
+; GFX90A-NEXT: v_mov_b32_e32 v21, s1
+; GFX90A-NEXT: v_mov_b32_e32 v22, s1
+; GFX90A-NEXT: v_mov_b32_e32 v23, s1
+; GFX90A-NEXT: v_mov_b32_e32 v24, s1
+; GFX90A-NEXT: v_mov_b32_e32 v25, s1
+; GFX90A-NEXT: v_mov_b32_e32 v26, s1
+; GFX90A-NEXT: v_mov_b32_e32 v27, s1
+; GFX90A-NEXT: v_mov_b32_e32 v28, s1
+; GFX90A-NEXT: v_mov_b32_e32 v29, s1
+; GFX90A-NEXT: v_mov_b32_e32 v30, s1
+; GFX90A-NEXT: v_mov_b32_e32 v31, s1
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_sgpr_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s1
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, s1
+; GFX942-NEXT: v_mov_b32_e32 v5, s1
+; GFX942-NEXT: v_mov_b32_e32 v6, s1
+; GFX942-NEXT: v_mov_b32_e32 v7, s1
+; GFX942-NEXT: v_mov_b32_e32 v8, s1
+; GFX942-NEXT: v_mov_b32_e32 v9, s1
+; GFX942-NEXT: v_mov_b32_e32 v10, s1
+; GFX942-NEXT: v_mov_b32_e32 v11, s1
+; GFX942-NEXT: v_mov_b32_e32 v12, s1
+; GFX942-NEXT: v_mov_b32_e32 v13, s1
+; GFX942-NEXT: v_mov_b32_e32 v14, s1
+; GFX942-NEXT: v_mov_b32_e32 v15, s1
+; GFX942-NEXT: v_mov_b32_e32 v16, s1
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s1
+; GFX942-NEXT: v_mov_b32_e32 v19, s1
+; GFX942-NEXT: v_mov_b32_e32 v20, s1
+; GFX942-NEXT: v_mov_b32_e32 v21, s1
+; GFX942-NEXT: v_mov_b32_e32 v22, s1
+; GFX942-NEXT: v_mov_b32_e32 v23, s1
+; GFX942-NEXT: v_mov_b32_e32 v24, s1
+; GFX942-NEXT: v_mov_b32_e32 v25, s1
+; GFX942-NEXT: v_mov_b32_e32 v26, s1
+; GFX942-NEXT: v_mov_b32_e32 v27, s1
+; GFX942-NEXT: v_mov_b32_e32 v28, s1
+; GFX942-NEXT: v_mov_b32_e32 v29, s1
+; GFX942-NEXT: v_mov_b32_e32 v30, s1
+; GFX942-NEXT: v_mov_b32_e32 v31, s1
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%tmp0 = insertelement <32 x float> poison, float %init, i32 0
@@ -1644,127 +1574,123 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-LABEL: test_mfma_loop_mixed_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v11, v2
+; GFX90A-NEXT: v_mov_b32_e32 v12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v13, v2
+; GFX90A-NEXT: v_mov_b32_e32 v14, v2
+; GFX90A-NEXT: v_mov_b32_e32 v15, v2
+; GFX90A-NEXT: v_mov_b32_e32 v16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v17, v2
+; GFX90A-NEXT: v_mov_b32_e32 v18, v2
+; GFX90A-NEXT: v_mov_b32_e32 v19, v2
+; GFX90A-NEXT: v_mov_b32_e32 v20, v2
+; GFX90A-NEXT: v_mov_b32_e32 v21, v2
+; GFX90A-NEXT: v_mov_b32_e32 v22, v2
+; GFX90A-NEXT: v_mov_b32_e32 v23, v2
+; GFX90A-NEXT: v_mov_b32_e32 v24, v2
+; GFX90A-NEXT: v_mov_b32_e32 v25, v2
+; GFX90A-NEXT: v_mov_b32_e32 v26, v2
+; GFX90A-NEXT: v_mov_b32_e32 v27, v2
+; GFX90A-NEXT: v_mov_b32_e32 v28, v2
+; GFX90A-NEXT: v_mov_b32_e32 v29, v2
+; GFX90A-NEXT: v_mov_b32_e32 v30, v2
+; GFX90A-NEXT: v_mov_b32_e32 v31, v2
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_mixed_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-NEXT: v_mov_b32_e32 v31, v2
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1882,58 +1808,58 @@ define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %ar
;
; GFX90A-LABEL: test_mfma_loop_mfma_forward_init:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, 0
; GFX90A-NEXT: .LBB7_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_mfma_forward_init:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; GFX942-NEXT: .LBB7_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB7_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -2087,126 +2013,126 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_agpr_init:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0
; GFX90A-NEXT: s_mov_b32 s0, 16
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 12
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_loop_agpr_init:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
; GFX942-NEXT: s_mov_b32 s0, 16
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 11
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -2366,41 +2292,41 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
;
; GFX90A-LABEL: test_mfma_nested_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s0, 0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2409,7 +2335,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s1, s1, -1
; GFX90A-NEXT: s_cmp_lg_u32 s1, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2
@@ -2420,56 +2346,56 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1
; GFX90A-NEXT: ; %bb.4: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 9
-; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_nested_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 s0, 0
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2478,7 +2404,7 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s1, s1, -1
; GFX942-NEXT: s_cmp_lg_u32 s1, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB9_2
@@ -2489,17 +2415,17 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: s_cbranch_scc1 .LBB9_1
; GFX942-NEXT: ; %bb.4: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 8
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -2613,163 +2539,97 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v33, v32, v[0:31]
; GFX90A-NEXT: s_add_i32 s4, s4, -1
; GFX90A-NEXT: s_cmp_lg_u32 s4, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %exit
-; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v32, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %exit
-; GFX942-NEXT: s_nop 14
-; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
br label %for.cond.preheader
@@ -2871,163 +2731,97 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, v32
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, v0
; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v32, v33, v[0:31]
; GFX90A-NEXT: s_add_i32 s4, s4, -1
; GFX90A-NEXT: s_cmp_lg_u32 s4, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %exit
-; GFX90A-NEXT: s_nop 15
-; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v33, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, v32
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-NEXT: v_mov_b32_e32 v4, v0
+; GFX942-NEXT: v_mov_b32_e32 v5, v0
+; GFX942-NEXT: v_mov_b32_e32 v6, v0
+; GFX942-NEXT: v_mov_b32_e32 v7, v0
+; GFX942-NEXT: v_mov_b32_e32 v8, v0
+; GFX942-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-NEXT: v_mov_b32_e32 v11, v0
+; GFX942-NEXT: v_mov_b32_e32 v12, v0
+; GFX942-NEXT: v_mov_b32_e32 v13, v0
+; GFX942-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-NEXT: v_mov_b32_e32 v15, v0
+; GFX942-NEXT: v_mov_b32_e32 v16, v0
+; GFX942-NEXT: v_mov_b32_e32 v17, v0
+; GFX942-NEXT: v_mov_b32_e32 v18, v0
+; GFX942-NEXT: v_mov_b32_e32 v19, v0
+; GFX942-NEXT: v_mov_b32_e32 v20, v0
+; GFX942-NEXT: v_mov_b32_e32 v21, v0
+; GFX942-NEXT: v_mov_b32_e32 v22, v0
+; GFX942-NEXT: v_mov_b32_e32 v23, v0
+; GFX942-NEXT: v_mov_b32_e32 v24, v0
+; GFX942-NEXT: v_mov_b32_e32 v25, v0
+; GFX942-NEXT: v_mov_b32_e32 v26, v0
+; GFX942-NEXT: v_mov_b32_e32 v27, v0
+; GFX942-NEXT: v_mov_b32_e32 v28, v0
+; GFX942-NEXT: v_mov_b32_e32 v29, v0
+; GFX942-NEXT: v_mov_b32_e32 v30, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, v0
; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31]
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %exit
-; GFX942-NEXT: s_nop 14
-; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
-; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
-; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
-; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
-; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
-; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
-; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
-; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
-; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
-; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
-; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
-; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
-; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
-; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
-; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
-; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
-; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
-; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
-; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
-; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
-; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
-; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
-; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
-; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
-; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
-; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
-; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
-; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
-; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
-; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
-; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
-; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
br label %for.cond.preheader
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 51cd564..323514b 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -219,397 +219,349 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-LABEL: test_mfma_f32_32x32x1f32:
; GREEDY90A: ; %bb.0: ; %bb
; GREEDY90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY90A-NEXT: v_mov_b32_e32 v64, 1.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v65, 2.0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s16
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s17
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s18
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s19
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s20
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s21
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s22
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s23
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s24
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, s25
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s26
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s27
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s28
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s29
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s30
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s31
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GREEDY90A-NEXT: v_mov_b32_e32 v0, s16
+; GREEDY90A-NEXT: v_mov_b32_e32 v1, s17
+; GREEDY90A-NEXT: v_mov_b32_e32 v2, s18
+; GREEDY90A-NEXT: v_mov_b32_e32 v3, s19
+; GREEDY90A-NEXT: v_mov_b32_e32 v4, s20
+; GREEDY90A-NEXT: v_mov_b32_e32 v5, s21
+; GREEDY90A-NEXT: v_mov_b32_e32 v6, s22
+; GREEDY90A-NEXT: v_mov_b32_e32 v7, s23
+; GREEDY90A-NEXT: v_mov_b32_e32 v8, s24
+; GREEDY90A-NEXT: v_mov_b32_e32 v9, s25
+; GREEDY90A-NEXT: v_mov_b32_e32 v10, s26
+; GREEDY90A-NEXT: v_mov_b32_e32 v11, s27
+; GREEDY90A-NEXT: v_mov_b32_e32 v12, s28
+; GREEDY90A-NEXT: v_mov_b32_e32 v13, s29
+; GREEDY90A-NEXT: v_mov_b32_e32 v14, s30
+; GREEDY90A-NEXT: v_mov_b32_e32 v15, s31
+; GREEDY90A-NEXT: v_mov_b32_e32 v16, s0
+; GREEDY90A-NEXT: v_mov_b32_e32 v17, s1
+; GREEDY90A-NEXT: v_mov_b32_e32 v18, s2
+; GREEDY90A-NEXT: v_mov_b32_e32 v19, s3
+; GREEDY90A-NEXT: v_mov_b32_e32 v20, s4
+; GREEDY90A-NEXT: v_mov_b32_e32 v21, s5
+; GREEDY90A-NEXT: v_mov_b32_e32 v22, s6
+; GREEDY90A-NEXT: v_mov_b32_e32 v23, s7
+; GREEDY90A-NEXT: v_mov_b32_e32 v24, s8
+; GREEDY90A-NEXT: v_mov_b32_e32 v25, s9
+; GREEDY90A-NEXT: v_mov_b32_e32 v26, s10
+; GREEDY90A-NEXT: v_mov_b32_e32 v27, s11
+; GREEDY90A-NEXT: v_mov_b32_e32 v28, s12
+; GREEDY90A-NEXT: v_mov_b32_e32 v29, s13
+; GREEDY90A-NEXT: v_mov_b32_e32 v30, s14
+; GREEDY90A-NEXT: v_mov_b32_e32 v31, s15
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
+; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31]
+; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v64, v65, v[0:31]
; GREEDY90A-NEXT: s_nop 15
; GREEDY90A-NEXT: s_nop 2
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61
-; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GREEDY90A-NEXT: v_mov_b32_e32 v2, v32
+; GREEDY90A-NEXT: v_mov_b32_e32 v3, v33
+; GREEDY90A-NEXT: v_mov_b32_e32 v4, v34
+; GREEDY90A-NEXT: v_mov_b32_e32 v5, v35
+; GREEDY90A-NEXT: v_mov_b32_e32 v6, v36
+; GREEDY90A-NEXT: v_mov_b32_e32 v7, v37
+; GREEDY90A-NEXT: v_mov_b32_e32 v8, v38
+; GREEDY90A-NEXT: v_mov_b32_e32 v9, v39
+; GREEDY90A-NEXT: v_mov_b32_e32 v10, v40
+; GREEDY90A-NEXT: v_mov_b32_e32 v11, v41
+; GREEDY90A-NEXT: v_mov_b32_e32 v12, v42
+; GREEDY90A-NEXT: v_mov_b32_e32 v13, v43
+; GREEDY90A-NEXT: v_mov_b32_e32 v14, v44
+; GREEDY90A-NEXT: v_mov_b32_e32 v15, v45
+; GREEDY90A-NEXT: v_mov_b32_e32 v16, v46
+; GREEDY90A-NEXT: v_mov_b32_e32 v17, v47
+; GREEDY90A-NEXT: v_mov_b32_e32 v18, v48
+; GREEDY90A-NEXT: v_mov_b32_e32 v19, v49
+; GREEDY90A-NEXT: v_mov_b32_e32 v20, v50
+; GREEDY90A-NEXT: v_mov_b32_e32 v21, v51
+; GREEDY90A-NEXT: v_mov_b32_e32 v22, v52
+; GREEDY90A-NEXT: v_mov_b32_e32 v23, v53
+; GREEDY90A-NEXT: v_mov_b32_e32 v24, v54
+; GREEDY90A-NEXT: v_mov_b32_e32 v25, v55
+; GREEDY90A-NEXT: v_mov_b32_e32 v26, v56
+; GREEDY90A-NEXT: v_mov_b32_e32 v27, v57
+; GREEDY90A-NEXT: v_mov_b32_e32 v28, v58
+; GREEDY90A-NEXT: v_mov_b32_e32 v29, v59
+; GREEDY90A-NEXT: v_mov_b32_e32 v30, v60
+; GREEDY90A-NEXT: v_mov_b32_e32 v31, v61
+; GREEDY90A-NEXT: v_mov_b32_e32 v32, 0
+; GREEDY90A-NEXT: s_nop 0
+; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31]
; GREEDY90A-NEXT: s_nop 15
; GREEDY90A-NEXT: s_nop 2
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35]
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GREEDY90A-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_32x32x1f32:
; GREEDY942: ; %bb.0: ; %bb
; GREEDY942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY942-NEXT: v_mov_b32_e32 v64, 1.0
+; GREEDY942-NEXT: v_mov_b32_e32 v65, 2.0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s16
-; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s17
-; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s18
-; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s19
-; GREEDY942-NEXT: v_accvgpr_write_b32 a4, s20
-; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s21
-; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s22
-; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s23
-; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s24
-; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s25
-; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s26
-; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s27
-; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s28
-; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s29
-; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s30
-; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s31
-; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0
-; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3
-; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4
-; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5
-; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6
-; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7
-; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8
-; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9
-; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10
-; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11
-; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12
-; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13
-; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14
-; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15
+; GREEDY942-NEXT: v_mov_b32_e32 v0, s16
+; GREEDY942-NEXT: v_mov_b32_e32 v1, s17
+; GREEDY942-NEXT: v_mov_b32_e32 v2, s18
+; GREEDY942-NEXT: v_mov_b32_e32 v3, s19
+; GREEDY942-NEXT: v_mov_b32_e32 v4, s20
+; GREEDY942-NEXT: v_mov_b32_e32 v5, s21
+; GREEDY942-NEXT: v_mov_b32_e32 v6, s22
+; GREEDY942-NEXT: v_mov_b32_e32 v7, s23
+; GREEDY942-NEXT: v_mov_b32_e32 v8, s24
+; GREEDY942-NEXT: v_mov_b32_e32 v9, s25
+; GREEDY942-NEXT: v_mov_b32_e32 v10, s26
+; GREEDY942-NEXT: v_mov_b32_e32 v11, s27
+; GREEDY942-NEXT: v_mov_b32_e32 v12, s28
+; GREEDY942-NEXT: v_mov_b32_e32 v13, s29
+; GREEDY942-NEXT: v_mov_b32_e32 v14, s30
+; GREEDY942-NEXT: v_mov_b32_e32 v15, s31
+; GREEDY942-NEXT: v_mov_b32_e32 v16, s0
+; GREEDY942-NEXT: v_mov_b32_e32 v17, s1
+; GREEDY942-NEXT: v_mov_b32_e32 v18, s2
+; GREEDY942-NEXT: v_mov_b32_e32 v19, s3
+; GREEDY942-NEXT: v_mov_b32_e32 v20, s4
+; GREEDY942-NEXT: v_mov_b32_e32 v21, s5
+; GREEDY942-NEXT: v_mov_b32_e32 v22, s6
+; GREEDY942-NEXT: v_mov_b32_e32 v23, s7
+; GREEDY942-NEXT: v_mov_b32_e32 v24, s8
+; GREEDY942-NEXT: v_mov_b32_e32 v25, s9
+; GREEDY942-NEXT: v_mov_b32_e32 v26, s10
+; GREEDY942-NEXT: v_mov_b32_e32 v27, s11
+; GREEDY942-NEXT: v_mov_b32_e32 v28, s12
+; GREEDY942-NEXT: v_mov_b32_e32 v29, s13
+; GREEDY942-NEXT: v_mov_b32_e32 v30, s14
+; GREEDY942-NEXT: v_mov_b32_e32 v31, s15
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
-; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31]
+; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v64, v65, v[0:31]
+; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[32:63], v64, v65, v[0:31]
; GREEDY942-NEXT: s_nop 15
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61
-; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GREEDY942-NEXT: v_mov_b32_e32 v2, v32
+; GREEDY942-NEXT: v_mov_b32_e32 v3, v33
+; GREEDY942-NEXT: v_mov_b32_e32 v4, v34
+; GREEDY942-NEXT: v_mov_b32_e32 v5, v35
+; GREEDY942-NEXT: v_mov_b32_e32 v6, v36
+; GREEDY942-NEXT: v_mov_b32_e32 v7, v37
+; GREEDY942-NEXT: v_mov_b32_e32 v8, v38
+; GREEDY942-NEXT: v_mov_b32_e32 v9, v39
+; GREEDY942-NEXT: v_mov_b32_e32 v10, v40
+; GREEDY942-NEXT: v_mov_b32_e32 v11, v41
+; GREEDY942-NEXT: v_mov_b32_e32 v12, v42
+; GREEDY942-NEXT: v_mov_b32_e32 v13, v43
+; GREEDY942-NEXT: v_mov_b32_e32 v14, v44
+; GREEDY942-NEXT: v_mov_b32_e32 v15, v45
+; GREEDY942-NEXT: v_mov_b32_e32 v16, v46
+; GREEDY942-NEXT: v_mov_b32_e32 v17, v47
+; GREEDY942-NEXT: v_mov_b32_e32 v18, v48
+; GREEDY942-NEXT: v_mov_b32_e32 v19, v49
+; GREEDY942-NEXT: v_mov_b32_e32 v20, v50
+; GREEDY942-NEXT: v_mov_b32_e32 v21, v51
+; GREEDY942-NEXT: v_mov_b32_e32 v22, v52
+; GREEDY942-NEXT: v_mov_b32_e32 v23, v53
+; GREEDY942-NEXT: v_mov_b32_e32 v24, v54
+; GREEDY942-NEXT: v_mov_b32_e32 v25, v55
+; GREEDY942-NEXT: v_mov_b32_e32 v26, v56
+; GREEDY942-NEXT: v_mov_b32_e32 v27, v57
+; GREEDY942-NEXT: v_mov_b32_e32 v28, v58
+; GREEDY942-NEXT: v_mov_b32_e32 v29, v59
+; GREEDY942-NEXT: v_mov_b32_e32 v30, v60
+; GREEDY942-NEXT: v_mov_b32_e32 v31, v61
+; GREEDY942-NEXT: v_mov_b32_e32 v32, 0
+; GREEDY942-NEXT: s_nop 0
+; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v64, v65, v[0:31]
; GREEDY942-NEXT: s_nop 15
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35]
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GREEDY942-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_32x32x1f32:
; GREEDY90A-GISEL: ; %bb.0: ; %bb
; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v64, 1.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v65, 2.0
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a20, s20
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a21, s21
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a22, s22
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a23, s23
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a24, s24
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a25, s25
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a26, s26
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a27, s27
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a28, s28
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a29, s29
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a30, s30
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a31, s31
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[16:17], s[16:17], s[16:17] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[18:19], s[18:19], s[18:19] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[20:21], s[20:21], s[20:21] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[22:23], s[22:23], s[22:23] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[24:25], s[24:25], s[24:25] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[26:27], s[26:27], s[26:27] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[28:29], s[28:29], s[28:29] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[30:31], s[30:31], s[30:31] op_sel:[0,1]
; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31]
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31]
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 v[32:63], v64, v65, v[0:31]
; GREEDY90A-GISEL-NEXT: s_nop 15
; GREEDY90A-GISEL-NEXT: s_nop 2
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a34
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a35
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a36
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a37
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a38
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a39
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a40
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a41
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a42
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a43
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a44
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a45
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a16, a46
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a17, a47
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a18, a48
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a19, a49
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a20, a50
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a21, a51
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a22, a52
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a23, a53
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a24, a54
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a25, a55
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a26, a56
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a27, a57
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a28, a58
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a29, a59
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a30, a60
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a31, a61
-; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v2, v32
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v3, v33
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, v34
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v5, v35
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, v36
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, v37
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v8, v38
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v9, v39
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v10, v40
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v11, v41
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v12, v42
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v13, v43
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v14, v44
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v15, v45
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v16, v46
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v17, v47
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v18, v48
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v19, v49
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v20, v50
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v21, v51
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v22, v52
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v23, v53
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v24, v54
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v25, v55
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v26, v56
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v27, v57
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v28, v58
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v29, v59
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v30, v60
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v31, v61
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v32, 0
+; GREEDY90A-GISEL-NEXT: s_nop 0
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 v[0:31], v64, v65, v[0:31]
; GREEDY90A-GISEL-NEXT: s_nop 15
-; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35]
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112
+; GREEDY90A-GISEL-NEXT: s_nop 2
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35]
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112
; GREEDY90A-GISEL-NEXT: s_endpgm
;
; FAST90A-LABEL: test_mfma_f32_32x32x1f32:
; FAST90A: ; %bb.0: ; %bb
; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0
-; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0
+; FAST90A-NEXT: v_mov_b32_e32 v34, 2.0
; FAST90A-NEXT: v_mov_b32_e32 v0, 0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36
-; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37
-; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38
-; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39
-; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40
-; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41
-; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42
-; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43
-; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44
-; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45
-; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46
-; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47
-; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48
-; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49
-; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50
-; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51
-; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19
+; FAST90A-NEXT: v_mov_b32_e32 v2, s36
+; FAST90A-NEXT: v_mov_b32_e32 v3, s37
+; FAST90A-NEXT: v_mov_b32_e32 v4, s38
+; FAST90A-NEXT: v_mov_b32_e32 v5, s39
+; FAST90A-NEXT: v_mov_b32_e32 v6, s40
+; FAST90A-NEXT: v_mov_b32_e32 v7, s41
+; FAST90A-NEXT: v_mov_b32_e32 v8, s42
+; FAST90A-NEXT: v_mov_b32_e32 v9, s43
+; FAST90A-NEXT: v_mov_b32_e32 v10, s44
+; FAST90A-NEXT: v_mov_b32_e32 v11, s45
+; FAST90A-NEXT: v_mov_b32_e32 v12, s46
+; FAST90A-NEXT: v_mov_b32_e32 v13, s47
+; FAST90A-NEXT: v_mov_b32_e32 v14, s48
+; FAST90A-NEXT: v_mov_b32_e32 v15, s49
+; FAST90A-NEXT: v_mov_b32_e32 v16, s50
+; FAST90A-NEXT: v_mov_b32_e32 v17, s51
+; FAST90A-NEXT: v_mov_b32_e32 v18, s4
+; FAST90A-NEXT: v_mov_b32_e32 v19, s5
+; FAST90A-NEXT: v_mov_b32_e32 v20, s6
+; FAST90A-NEXT: v_mov_b32_e32 v21, s7
+; FAST90A-NEXT: v_mov_b32_e32 v22, s8
+; FAST90A-NEXT: v_mov_b32_e32 v23, s9
+; FAST90A-NEXT: v_mov_b32_e32 v24, s10
+; FAST90A-NEXT: v_mov_b32_e32 v25, s11
+; FAST90A-NEXT: v_mov_b32_e32 v26, s12
+; FAST90A-NEXT: v_mov_b32_e32 v27, s13
+; FAST90A-NEXT: v_mov_b32_e32 v28, s14
+; FAST90A-NEXT: v_mov_b32_e32 v29, s15
+; FAST90A-NEXT: v_mov_b32_e32 v30, s16
+; FAST90A-NEXT: v_mov_b32_e32 v31, s17
+; FAST90A-NEXT: v_mov_b32_e32 v32, s18
+; FAST90A-NEXT: v_mov_b32_e32 v33, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v1, v34, v[2:33]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[36:67], v1, v34, v[2:33]
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
-; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
-; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27
-; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26
-; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25
-; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24
-; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23
-; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22
-; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21
-; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20
-; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19
-; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18
-; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17
-; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16
-; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15
-; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14
-; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13
-; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12
-; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11
-; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10
-; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9
-; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8
-; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7
-; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6
-; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5
-; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4
-; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3
-; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2
-; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1
-; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0
-; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32
-; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19
-; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18
-; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17
-; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16
-; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15
-; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14
-; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13
-; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12
-; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11
-; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10
-; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9
-; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8
-; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7
-; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6
-; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5
-; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4
-; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
+; FAST90A-NEXT: v_mov_b32_e32 v4, v36
+; FAST90A-NEXT: v_mov_b32_e32 v5, v37
+; FAST90A-NEXT: v_mov_b32_e32 v6, v38
+; FAST90A-NEXT: v_mov_b32_e32 v7, v39
+; FAST90A-NEXT: v_mov_b32_e32 v8, v40
+; FAST90A-NEXT: v_mov_b32_e32 v9, v41
+; FAST90A-NEXT: v_mov_b32_e32 v10, v42
+; FAST90A-NEXT: v_mov_b32_e32 v11, v43
+; FAST90A-NEXT: v_mov_b32_e32 v12, v44
+; FAST90A-NEXT: v_mov_b32_e32 v13, v45
+; FAST90A-NEXT: v_mov_b32_e32 v14, v46
+; FAST90A-NEXT: v_mov_b32_e32 v15, v47
+; FAST90A-NEXT: v_mov_b32_e32 v16, v48
+; FAST90A-NEXT: v_mov_b32_e32 v17, v49
+; FAST90A-NEXT: v_mov_b32_e32 v18, v50
+; FAST90A-NEXT: v_mov_b32_e32 v19, v51
+; FAST90A-NEXT: v_mov_b32_e32 v20, v52
+; FAST90A-NEXT: v_mov_b32_e32 v21, v53
+; FAST90A-NEXT: v_mov_b32_e32 v22, v54
+; FAST90A-NEXT: v_mov_b32_e32 v23, v55
+; FAST90A-NEXT: v_mov_b32_e32 v24, v56
+; FAST90A-NEXT: v_mov_b32_e32 v25, v57
+; FAST90A-NEXT: v_mov_b32_e32 v26, v58
+; FAST90A-NEXT: v_mov_b32_e32 v27, v59
+; FAST90A-NEXT: v_mov_b32_e32 v28, v60
+; FAST90A-NEXT: v_mov_b32_e32 v29, v61
+; FAST90A-NEXT: v_mov_b32_e32 v30, v62
+; FAST90A-NEXT: v_mov_b32_e32 v31, v63
+; FAST90A-NEXT: v_mov_b32_e32 v32, v64
+; FAST90A-NEXT: v_mov_b32_e32 v33, v65
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 v[2:33], v1, v34, v[2:33]
; FAST90A-NEXT: s_nop 15
; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
-; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
-; FAST90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
-; FAST90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
-; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; FAST90A-NEXT: global_store_dwordx4 v0, v[26:29], s[0:1] offset:96
+; FAST90A-NEXT: global_store_dwordx4 v0, v[30:33], s[0:1] offset:112
+; FAST90A-NEXT: global_store_dwordx4 v0, v[18:21], s[0:1] offset:64
+; FAST90A-NEXT: global_store_dwordx4 v0, v[22:25], s[0:1] offset:80
+; FAST90A-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; FAST90A-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
+; FAST90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
; FAST90A-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -707,185 +659,177 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY90A: ; %bb.0: ; %bb
; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY90A-NEXT: v_mov_b32_e32 v32, 1.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v33, 2.0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0
-; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
-; GREEDY90A-NEXT: s_nop 9
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
+; GREEDY90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[16:31], v32, v33, v[0:15]
+; GREEDY90A-NEXT: s_nop 10
+; GREEDY90A-NEXT: v_mov_b32_e32 v2, v16
+; GREEDY90A-NEXT: v_mov_b32_e32 v3, v17
+; GREEDY90A-NEXT: v_mov_b32_e32 v4, v18
+; GREEDY90A-NEXT: v_mov_b32_e32 v5, v19
+; GREEDY90A-NEXT: v_mov_b32_e32 v6, v20
+; GREEDY90A-NEXT: v_mov_b32_e32 v7, v21
+; GREEDY90A-NEXT: v_mov_b32_e32 v8, v22
+; GREEDY90A-NEXT: v_mov_b32_e32 v9, v23
+; GREEDY90A-NEXT: v_mov_b32_e32 v10, v24
+; GREEDY90A-NEXT: v_mov_b32_e32 v11, v25
+; GREEDY90A-NEXT: v_mov_b32_e32 v12, v26
+; GREEDY90A-NEXT: v_mov_b32_e32 v13, v27
+; GREEDY90A-NEXT: v_mov_b32_e32 v14, v28
+; GREEDY90A-NEXT: v_mov_b32_e32 v15, v29
+; GREEDY90A-NEXT: v_mov_b32_e32 v16, 0
+; GREEDY90A-NEXT: s_nop 0
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15]
; GREEDY90A-NEXT: s_nop 10
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY90A-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GREEDY90A-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GREEDY90A-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GREEDY90A-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY942: ; %bb.0: ; %bb
; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY942-NEXT: v_mov_b32_e32 v32, 1.0
+; GREEDY942-NEXT: v_mov_b32_e32 v33, 2.0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0
+; GREEDY942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GREEDY942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GREEDY942-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GREEDY942-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GREEDY942-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
+; GREEDY942-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
+; GREEDY942-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
+; GREEDY942-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
-; GREEDY942-NEXT: s_nop 8
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
-; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v32, v33, v[0:15]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[16:31], v32, v33, v[0:15]
+; GREEDY942-NEXT: s_nop 9
+; GREEDY942-NEXT: v_mov_b32_e32 v2, v16
+; GREEDY942-NEXT: v_mov_b32_e32 v3, v17
+; GREEDY942-NEXT: v_mov_b32_e32 v4, v18
+; GREEDY942-NEXT: v_mov_b32_e32 v5, v19
+; GREEDY942-NEXT: v_mov_b32_e32 v6, v20
+; GREEDY942-NEXT: v_mov_b32_e32 v7, v21
+; GREEDY942-NEXT: v_mov_b32_e32 v8, v22
+; GREEDY942-NEXT: v_mov_b32_e32 v9, v23
+; GREEDY942-NEXT: v_mov_b32_e32 v10, v24
+; GREEDY942-NEXT: v_mov_b32_e32 v11, v25
+; GREEDY942-NEXT: v_mov_b32_e32 v12, v26
+; GREEDY942-NEXT: v_mov_b32_e32 v13, v27
+; GREEDY942-NEXT: v_mov_b32_e32 v14, v28
+; GREEDY942-NEXT: v_mov_b32_e32 v15, v29
+; GREEDY942-NEXT: v_mov_b32_e32 v16, 0
+; GREEDY942-NEXT: s_nop 0
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v32, v33, v[0:15]
; GREEDY942-NEXT: s_nop 9
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY942-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY90A-GISEL: ; %bb.0: ; %bb
; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v32, 1.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v33, 2.0
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[10:11], s[10:11], s[10:11] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[12:13], s[12:13], s[12:13] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[14:15], s[14:15], s[14:15] op_sel:[0,1]
; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15]
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15]
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[16:31], v32, v33, v[0:15]
; GREEDY90A-GISEL-NEXT: s_nop 10
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a19
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a20
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a21
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a22
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a23
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a24
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a25
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a26
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a27
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a28
-; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a29
-; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GREEDY90A-GISEL-NEXT: s_nop 9
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v2, v16
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v3, v17
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, v18
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v5, v19
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, v20
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, v21
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v8, v22
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v9, v23
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v10, v24
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v11, v25
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v12, v26
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v13, v27
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v14, v28
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v15, v29
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v16, 0
+; GREEDY90A-GISEL-NEXT: s_nop 0
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 v[0:15], v32, v33, v[0:15]
+; GREEDY90A-GISEL-NEXT: s_nop 10
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GREEDY90A-GISEL-NEXT: s_endpgm
;
; FAST90A-LABEL: test_mfma_f32_16x16x1f32:
; FAST90A: ; %bb.0: ; %bb
; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0
-; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0
-; FAST90A-NEXT: v_mov_b32_e32 v0, 0
+; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0
+; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19
+; FAST90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[4:5], s[6:7], s[6:7] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[8:9], s[10:11], s[10:11] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[12:13], s[14:15], s[14:15] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[14:15], s[16:17], s[16:17] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[16:17], s[18:19], s[18:19] op_sel:[0,1]
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[2:17], v0, v1, v[2:17]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[18:33], v0, v1, v[2:17]
; FAST90A-NEXT: s_nop 10
-; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
-; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
-; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18
-; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19
-; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20
-; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21
-; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22
-; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23
-; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24
-; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25
-; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26
-; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27
-; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28
-; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
+; FAST90A-NEXT: v_mov_b32_e32 v4, v18
+; FAST90A-NEXT: v_mov_b32_e32 v5, v19
+; FAST90A-NEXT: v_mov_b32_e32 v6, v20
+; FAST90A-NEXT: v_mov_b32_e32 v7, v21
+; FAST90A-NEXT: v_mov_b32_e32 v8, v22
+; FAST90A-NEXT: v_mov_b32_e32 v9, v23
+; FAST90A-NEXT: v_mov_b32_e32 v10, v24
+; FAST90A-NEXT: v_mov_b32_e32 v11, v25
+; FAST90A-NEXT: v_mov_b32_e32 v12, v26
+; FAST90A-NEXT: v_mov_b32_e32 v13, v27
+; FAST90A-NEXT: v_mov_b32_e32 v14, v28
+; FAST90A-NEXT: v_mov_b32_e32 v15, v29
+; FAST90A-NEXT: v_mov_b32_e32 v16, v30
+; FAST90A-NEXT: v_mov_b32_e32 v17, v31
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT: s_nop 10
-; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 v[2:17], v0, v1, v[2:17]
+; FAST90A-NEXT: v_mov_b32_e32 v0, 0
+; FAST90A-NEXT: s_nop 9
+; FAST90A-NEXT: global_store_dwordx4 v0, v[14:17], s[0:1] offset:48
+; FAST90A-NEXT: global_store_dwordx4 v0, v[10:13], s[0:1] offset:32
+; FAST90A-NEXT: global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; FAST90A-NEXT: s_endpgm
bb:
%in.1 = load <16 x float>, ptr addrspace(1) %arg
@@ -934,68 +878,63 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY90A: ; %bb.0: ; %bb
; GREEDY90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY90A-NEXT: v_mov_b32_e32 v6, 1.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v7, 2.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v8, 0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s3
+; GREEDY90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GREEDY90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
-; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3]
+; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3]
+; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v6, v7, v[0:3]
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
+; GREEDY90A-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3]
; GREEDY90A-NEXT: s_nop 4
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GREEDY90A-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY942: ; %bb.0: ; %bb
; GREEDY942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY942-NEXT: v_mov_b32_e32 v6, 1.0
+; GREEDY942-NEXT: v_mov_b32_e32 v7, 2.0
+; GREEDY942-NEXT: v_mov_b32_e32 v8, 0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s3
+; GREEDY942-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GREEDY942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
+; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3]
+; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[2:5], v6, v7, v[0:3]
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
+; GREEDY942-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v6, v7, v[0:3]
; GREEDY942-NEXT: s_nop 3
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
+; GREEDY942-NEXT: global_store_dwordx4 v8, v[0:3], s[6:7]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY90A-GISEL: ; %bb.0: ; %bb
; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v6, 1.0
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v7, 2.0
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GREEDY90A-GISEL-NEXT: s_nop 1
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3]
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v6, v7, v[0:3]
+; GREEDY90A-GISEL-NEXT: s_nop 1
+; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 v[0:3], v6, v7, v[0:3]
; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3]
+; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v4, 0
; GREEDY90A-GISEL-NEXT: s_nop 1
-; GREEDY90A-GISEL-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
-; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GREEDY90A-GISEL-NEXT: s_nop 3
-; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GREEDY90A-GISEL-NEXT: s_endpgm
;
; FAST90A-LABEL: test_mfma_f32_4x4x1f32:
@@ -1007,20 +946,18 @@ define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7
+; FAST90A-NEXT: v_pk_mov_b32 v[4:5], s[4:5], s[4:5] op_sel:[0,1]
+; FAST90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
-; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3]
+; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[4:7], v1, v2, v[4:7]
+; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[8:11], v1, v2, v[4:7]
; FAST90A-NEXT: s_nop 4
-; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a4
-; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a5
+; FAST90A-NEXT: v_mov_b32_e32 v6, v8
+; FAST90A-NEXT: v_mov_b32_e32 v7, v9
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
+; FAST90A-NEXT: v_mfma_f32_4x4x1f32 v[2:5], v1, v2, v[4:7]
; FAST90A-NEXT: s_nop 4
-; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; FAST90A-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1]
; FAST90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
index cf244f0..5adb0cb 100644
--- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll
@@ -6,36 +6,36 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) {
; GFX942-LABEL: matmul_kernel:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_mov_b32 s2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: s_mov_b32 s3, 0
+; GFX942-NEXT: v_mov_b32_e32 v0, v1
+; GFX942-NEXT: s_mov_b32 s6, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v2
; GFX942-NEXT: s_branch .LBB0_2
; GFX942-NEXT: .LBB0_1: ; %bb2
; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1
-; GFX942-NEXT: s_or_b32 s4, s3, 1
-; GFX942-NEXT: s_ashr_i32 s5, s3, 31
; GFX942-NEXT: s_mov_b32 s3, s2
-; GFX942-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX942-NEXT: v_accvgpr_mov_b32 a0, a2
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1
-; GFX942-NEXT: s_and_b32 s3, s5, s4
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_16x16x16_f16 a[2:5], v[2:3], v[2:3], a[0:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: v_mov_b32_e32 v3, v1
+; GFX942-NEXT: s_or_b32 s4, s6, 1
+; GFX942-NEXT: s_ashr_i32 s3, s6, 31
+; GFX942-NEXT: v_mfma_f32_16x16x16_f16 v[2:5], v[4:5], v[4:5], v[0:3]
+; GFX942-NEXT: s_and_b32 s6, s3, s4
+; GFX942-NEXT: s_nop 5
+; GFX942-NEXT: v_mov_b32_e32 v0, v2
; GFX942-NEXT: s_cbranch_execz .LBB0_4
; GFX942-NEXT: .LBB0_2: ; %bb
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
; GFX942-NEXT: s_cbranch_vccz .LBB0_1
; GFX942-NEXT: ; %bb.3:
-; GFX942-NEXT: ; implicit-def: $sgpr3
-; GFX942-NEXT: ; implicit-def: $agpr2
+; GFX942-NEXT: ; implicit-def: $sgpr6
; GFX942-NEXT: .LBB0_4: ; %common.ret
; GFX942-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 6509d80..efdb7f1 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX908 %s
;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX908 %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
-;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 --stop-after=greedy,2 < %s | FileCheck -check-prefix=REGALLOC-GFX90A %s
+;RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 --stop-after=prologepilog < %s | FileCheck -check-prefix=PEI-GFX90A %s
; Partial reg copy and spill missed during regalloc handled later at frame lowering.
define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index da48af1..02aff39 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form=0 < %s | FileCheck -check-prefixes=GCN,GFX90A %s
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
; GFX908-LABEL: max_12regs_13a_used:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b045c76..0a379ba 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -872,15 +872,13 @@ define amdgpu_kernel void @v8i8_mfma_i8(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[14:15], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
+; GFX942-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_i32_16x16x32_i8 a[0:3], v[2:3], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_i32_16x16x32_i8 v[2:5], v[2:3], v[2:3], v[4:7] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 6
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
+; GFX942-NEXT: global_store_dwordx4 v0, v[2:5], s[12:13]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -908,66 +906,66 @@ define amdgpu_kernel void @v8i8_mfma_half(ptr addrspace(1) %src1, ptr addrspace(
; GFX942-LABEL: v8i8_mfma_half:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dwordx8 s[36:43], s[4:5], 0x24
-; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX942-NEXT: v_lshlrev_b32_e32 v1, 3, v4
-; GFX942-NEXT: v_mov_b32_e32 v0, 0
-; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4
+; GFX942-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 3, v1
+; GFX942-NEXT: v_mov_b32_e32 v32, 0
+; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[36:37]
+; GFX942-NEXT: global_load_dwordx2 v[34:35], v0, s[36:37]
; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX942-NEXT: s_cbranch_execz .LBB15_2
; GFX942-NEXT: ; %bb.1: ; %bb.1
-; GFX942-NEXT: global_load_dwordx2 v[2:3], v1, s[38:39]
+; GFX942-NEXT: global_load_dwordx2 v[34:35], v0, s[38:39]
; GFX942-NEXT: .LBB15_2: ; %bb.2
; GFX942-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX942-NEXT: s_load_dwordx16 s[16:31], s[42:43], 0x0
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[42:43], 0x40
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a0, s16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, s17
-; GFX942-NEXT: v_accvgpr_write_b32 a2, s18
-; GFX942-NEXT: v_accvgpr_write_b32 a3, s19
-; GFX942-NEXT: v_accvgpr_write_b32 a4, s20
-; GFX942-NEXT: v_accvgpr_write_b32 a5, s21
-; GFX942-NEXT: v_accvgpr_write_b32 a6, s22
-; GFX942-NEXT: v_accvgpr_write_b32 a7, s23
-; GFX942-NEXT: v_accvgpr_write_b32 a8, s24
-; GFX942-NEXT: v_accvgpr_write_b32 a9, s25
-; GFX942-NEXT: v_accvgpr_write_b32 a10, s26
-; GFX942-NEXT: v_accvgpr_write_b32 a11, s27
-; GFX942-NEXT: v_accvgpr_write_b32 a12, s28
-; GFX942-NEXT: v_accvgpr_write_b32 a13, s29
-; GFX942-NEXT: v_accvgpr_write_b32 a14, s30
-; GFX942-NEXT: v_accvgpr_write_b32 a15, s31
-; GFX942-NEXT: v_accvgpr_write_b32 a16, s0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a18, s2
-; GFX942-NEXT: v_accvgpr_write_b32 a19, s3
-; GFX942-NEXT: v_accvgpr_write_b32 a20, s4
-; GFX942-NEXT: v_accvgpr_write_b32 a21, s5
-; GFX942-NEXT: v_accvgpr_write_b32 a22, s6
-; GFX942-NEXT: v_accvgpr_write_b32 a23, s7
-; GFX942-NEXT: v_accvgpr_write_b32 a24, s8
-; GFX942-NEXT: v_accvgpr_write_b32 a25, s9
-; GFX942-NEXT: v_accvgpr_write_b32 a26, s10
-; GFX942-NEXT: v_accvgpr_write_b32 a27, s11
-; GFX942-NEXT: v_accvgpr_write_b32 a28, s12
-; GFX942-NEXT: v_accvgpr_write_b32 a29, s13
-; GFX942-NEXT: v_accvgpr_write_b32 a30, s14
-; GFX942-NEXT: v_accvgpr_write_b32 a31, s15
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: v_mov_b32_e32 v4, s20
+; GFX942-NEXT: v_mov_b32_e32 v5, s21
+; GFX942-NEXT: v_mov_b32_e32 v6, s22
+; GFX942-NEXT: v_mov_b32_e32 v7, s23
+; GFX942-NEXT: v_mov_b32_e32 v8, s24
+; GFX942-NEXT: v_mov_b32_e32 v9, s25
+; GFX942-NEXT: v_mov_b32_e32 v10, s26
+; GFX942-NEXT: v_mov_b32_e32 v11, s27
+; GFX942-NEXT: v_mov_b32_e32 v12, s28
+; GFX942-NEXT: v_mov_b32_e32 v13, s29
+; GFX942-NEXT: v_mov_b32_e32 v14, s30
+; GFX942-NEXT: v_mov_b32_e32 v15, s31
+; GFX942-NEXT: v_mov_b32_e32 v16, s0
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s2
+; GFX942-NEXT: v_mov_b32_e32 v19, s3
+; GFX942-NEXT: v_mov_b32_e32 v20, s4
+; GFX942-NEXT: v_mov_b32_e32 v21, s5
+; GFX942-NEXT: v_mov_b32_e32 v22, s6
+; GFX942-NEXT: v_mov_b32_e32 v23, s7
+; GFX942-NEXT: v_mov_b32_e32 v24, s8
+; GFX942-NEXT: v_mov_b32_e32 v25, s9
+; GFX942-NEXT: v_mov_b32_e32 v26, s10
+; GFX942-NEXT: v_mov_b32_e32 v27, s11
+; GFX942-NEXT: v_mov_b32_e32 v28, s12
+; GFX942-NEXT: v_mov_b32_e32 v29, s13
+; GFX942-NEXT: v_mov_b32_e32 v30, s14
+; GFX942-NEXT: v_mov_b32_e32 v31, s15
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_nop 0
-; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[2:3], a[0:31] cbsz:1 abid:2 blgp:3
+; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[34:35], v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 2
-; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[40:41] offset:112
-; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[40:41] offset:96
-; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[40:41] offset:80
-; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[40:41] offset:64
-; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[40:41] offset:48
-; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[40:41] offset:32
-; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[40:41] offset:16
-; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[40:41]
+; GFX942-NEXT: global_store_dwordx4 v32, v[28:31], s[40:41] offset:112
+; GFX942-NEXT: global_store_dwordx4 v32, v[24:27], s[40:41] offset:96
+; GFX942-NEXT: global_store_dwordx4 v32, v[20:23], s[40:41] offset:80
+; GFX942-NEXT: global_store_dwordx4 v32, v[16:19], s[40:41] offset:64
+; GFX942-NEXT: global_store_dwordx4 v32, v[12:15], s[40:41] offset:48
+; GFX942-NEXT: global_store_dwordx4 v32, v[8:11], s[40:41] offset:32
+; GFX942-NEXT: global_store_dwordx4 v32, v[4:7], s[40:41] offset:16
+; GFX942-NEXT: global_store_dwordx4 v32, v[0:3], s[40:41]
; GFX942-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()