aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll1994
1 files changed, 893 insertions, 1101 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index 21465be..d81ec1c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -141,20 +141,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -166,16 +164,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -183,20 +179,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd:
@@ -266,20 +260,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -291,16 +283,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -308,20 +298,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags:
@@ -1505,62 +1493,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1569,52 +1550,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1623,62 +1596,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v10, s20
-; HEURRC-NEXT: v_mov_b32_e32 v11, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v12, s22
-; HEURRC-NEXT: v_mov_b32_e32 v13, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -1687,7 +1653,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -1701,41 +1667,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0,
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31]
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd:
@@ -1869,62 +1835,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; SDAG-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v8, 0
+; SDAG-NEXT: v_mov_b32_e32 v44, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
-; SDAG-NEXT: v_mov_b32_e32 v10, s20
-; SDAG-NEXT: v_mov_b32_e32 v11, s21
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v12, s22
-; SDAG-NEXT: v_mov_b32_e32 v13, s23
-; SDAG-NEXT: v_mov_b32_e32 v0, s16
-; SDAG-NEXT: v_mov_b32_e32 v1, s17
-; SDAG-NEXT: v_mov_b32_e32 v2, s18
-; SDAG-NEXT: v_mov_b32_e32 v3, s19
-; SDAG-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; SDAG-NEXT: v_mov_b32_e32 v40, s20
+; SDAG-NEXT: v_mov_b32_e32 v41, s21
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: v_mov_b32_e32 v42, s22
+; SDAG-NEXT: v_mov_b32_e32 v43, s23
+; SDAG-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: s_nop 2
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s12
-; SDAG-NEXT: v_mov_b32_e32 v1, s13
-; SDAG-NEXT: v_mov_b32_e32 v2, s14
-; SDAG-NEXT: v_mov_b32_e32 v3, s15
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v0, s8
-; SDAG-NEXT: v_mov_b32_e32 v1, s9
-; SDAG-NEXT: v_mov_b32_e32 v2, s10
-; SDAG-NEXT: v_mov_b32_e32 v3, s11
-; SDAG-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -1933,52 +1892,44 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -1987,62 +1938,55 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v44, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
-; HEURRC-NEXT: v_mov_b32_e32 v10, s20
-; HEURRC-NEXT: v_mov_b32_e32 v11, s21
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v12, s22
-; HEURRC-NEXT: v_mov_b32_e32 v13, s23
-; HEURRC-NEXT: v_mov_b32_e32 v0, s16
-; HEURRC-NEXT: v_mov_b32_e32 v1, s17
-; HEURRC-NEXT: v_mov_b32_e32 v2, s18
-; HEURRC-NEXT: v_mov_b32_e32 v3, s19
-; HEURRC-NEXT: global_store_dwordx4 v8, v[10:13], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
+; HEURRC-NEXT: v_mov_b32_e32 v40, s20
+; HEURRC-NEXT: v_mov_b32_e32 v41, s21
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: v_mov_b32_e32 v42, s22
+; HEURRC-NEXT: v_mov_b32_e32 v43, s23
+; HEURRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: s_nop 2
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s12
-; HEURRC-NEXT: v_mov_b32_e32 v1, s13
-; HEURRC-NEXT: v_mov_b32_e32 v2, s14
-; HEURRC-NEXT: v_mov_b32_e32 v3, s15
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v0, s8
-; HEURRC-NEXT: v_mov_b32_e32 v1, s9
-; HEURRC-NEXT: v_mov_b32_e32 v2, s10
-; HEURRC-NEXT: v_mov_b32_e32 v3, s11
-; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -2051,7 +1995,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v44, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
@@ -2065,41 +2009,41 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half>
; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
-; VGPRRC-NEXT: v_mov_b32_e32 v42, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v43, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v40, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v41, s21
; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
-; VGPRRC-NEXT: v_mov_b32_e32 v44, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v45, s23
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[42:45], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: v_mov_b32_e32 v42, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v43, s23
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 2
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags:
@@ -2234,35 +2178,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2271,35 +2207,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2308,35 +2236,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac:
@@ -2443,35 +2363,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2480,35 +2392,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2517,35 +2421,27 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags:
@@ -2781,24 +2677,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2810,16 +2706,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2827,24 +2721,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: v_mov_b32_e32 v6, s12
-; HEURRC-NEXT: v_mov_b32_e32 v7, s13
-; HEURRC-NEXT: v_mov_b32_e32 v8, s14
-; HEURRC-NEXT: v_mov_b32_e32 v9, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3]
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
@@ -2852,24 +2746,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3]
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd:
; AGPR: ; %bb.0:
@@ -2930,24 +2824,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: v_mov_b32_e32 v6, s12
-; SDAG-NEXT: v_mov_b32_e32 v7, s13
-; SDAG-NEXT: v_mov_b32_e32 v8, s14
-; SDAG-NEXT: v_mov_b32_e32 v9, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s0
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s1
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s2
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, s8
+; SDAG-NEXT: v_mov_b32_e32 v1, s9
+; SDAG-NEXT: v_mov_b32_e32 v2, s10
+; SDAG-NEXT: v_mov_b32_e32 v3, s11
+; SDAG-NEXT: v_mov_b32_e32 v4, s12
+; SDAG-NEXT: v_mov_b32_e32 v5, s13
+; SDAG-NEXT: v_mov_b32_e32 v6, s14
+; SDAG-NEXT: v_mov_b32_e32 v7, s15
+; SDAG-NEXT: v_mov_b32_e32 v8, s0
+; SDAG-NEXT: v_mov_b32_e32 v9, s1
+; SDAG-NEXT: v_mov_b32_e32 v10, s2
+; SDAG-NEXT: v_mov_b32_e32 v11, s3
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; SDAG-NEXT: s_nop 7
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; SDAG-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2959,16 +2853,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s0
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s1
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s2
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s3
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
; GISEL-NEXT: s_nop 6
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -2976,24 +2868,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: v_mov_b32_e32 v6, s12
-; HEURRC-NEXT: v_mov_b32_e32 v7, s13
-; HEURRC-NEXT: v_mov_b32_e32 v8, s14
-; HEURRC-NEXT: v_mov_b32_e32 v9, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b32_e32 v0, s8
+; HEURRC-NEXT: v_mov_b32_e32 v1, s9
+; HEURRC-NEXT: v_mov_b32_e32 v2, s10
+; HEURRC-NEXT: v_mov_b32_e32 v3, s11
+; HEURRC-NEXT: v_mov_b32_e32 v4, s12
+; HEURRC-NEXT: v_mov_b32_e32 v5, s13
+; HEURRC-NEXT: v_mov_b32_e32 v6, s14
+; HEURRC-NEXT: v_mov_b32_e32 v7, s15
+; HEURRC-NEXT: v_mov_b32_e32 v8, s0
+; HEURRC-NEXT: v_mov_b32_e32 v9, s1
+; HEURRC-NEXT: v_mov_b32_e32 v10, s2
+; HEURRC-NEXT: v_mov_b32_e32 v11, s3
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[2:5], v[6:9], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
@@ -3001,24 +2893,24 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr
; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
-; VGPRRC-NEXT: v_mov_b32_e32 v4, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v12, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v6, s8
-; VGPRRC-NEXT: v_mov_b32_e32 v7, s9
-; VGPRRC-NEXT: v_mov_b32_e32 v8, s10
-; VGPRRC-NEXT: v_mov_b32_e32 v9, s11
-; VGPRRC-NEXT: v_mov_b32_e32 v10, s12
-; VGPRRC-NEXT: v_mov_b32_e32 v11, s13
-; VGPRRC-NEXT: v_mov_b32_e32 v12, s14
-; VGPRRC-NEXT: v_mov_b32_e32 v13, s15
-; VGPRRC-NEXT: v_mov_b32_e32 v0, s0
-; VGPRRC-NEXT: v_mov_b32_e32 v1, s1
-; VGPRRC-NEXT: v_mov_b32_e32 v2, s2
-; VGPRRC-NEXT: v_mov_b32_e32 v3, s3
+; VGPRRC-NEXT: v_mov_b32_e32 v0, s8
+; VGPRRC-NEXT: v_mov_b32_e32 v1, s9
+; VGPRRC-NEXT: v_mov_b32_e32 v2, s10
+; VGPRRC-NEXT: v_mov_b32_e32 v3, s11
+; VGPRRC-NEXT: v_mov_b32_e32 v4, s12
+; VGPRRC-NEXT: v_mov_b32_e32 v5, s13
+; VGPRRC-NEXT: v_mov_b32_e32 v6, s14
+; VGPRRC-NEXT: v_mov_b32_e32 v7, s15
+; VGPRRC-NEXT: v_mov_b32_e32 v8, s0
+; VGPRRC-NEXT: v_mov_b32_e32 v9, s1
+; VGPRRC-NEXT: v_mov_b32_e32 v10, s2
+; VGPRRC-NEXT: v_mov_b32_e32 v11, s3
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[6:9], v[10:13], v[0:3] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; VGPRRC-NEXT: s_nop 7
-; VGPRRC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags:
; AGPR: ; %bb.0:
@@ -4246,70 +4138,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v6, s24
-; SDAG-NEXT: v_mov_b32_e32 v7, s25
-; SDAG-NEXT: v_mov_b32_e32 v8, s26
-; SDAG-NEXT: v_mov_b32_e32 v9, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s16
-; SDAG-NEXT: v_mov_b32_e32 v3, s17
-; SDAG-NEXT: v_mov_b32_e32 v4, s18
-; SDAG-NEXT: v_mov_b32_e32 v5, s19
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4318,52 +4203,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15]
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4371,70 +4248,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v6, s24
-; HEURRC-NEXT: v_mov_b32_e32 v7, s25
-; HEURRC-NEXT: v_mov_b32_e32 v8, s26
-; HEURRC-NEXT: v_mov_b32_e32 v9, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31]
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s16
-; HEURRC-NEXT: v_mov_b32_e32 v3, s17
-; HEURRC-NEXT: v_mov_b32_e32 v4, s18
-; HEURRC-NEXT: v_mov_b32_e32 v5, s19
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s12
-; HEURRC-NEXT: v_mov_b32_e32 v3, s13
-; HEURRC-NEXT: v_mov_b32_e32 v4, s14
-; HEURRC-NEXT: v_mov_b32_e32 v5, s15
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4442,17 +4312,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4463,42 +4333,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31]
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
@@ -4645,70 +4515,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; SDAG: ; %bb.0:
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
+; SDAG-NEXT: v_mov_b32_e32 v32, s20
+; SDAG-NEXT: v_mov_b32_e32 v33, s21
+; SDAG-NEXT: v_mov_b32_e32 v34, s22
+; SDAG-NEXT: v_mov_b32_e32 v35, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v6, s24
-; SDAG-NEXT: v_mov_b32_e32 v7, s25
-; SDAG-NEXT: v_mov_b32_e32 v8, s26
-; SDAG-NEXT: v_mov_b32_e32 v9, s27
+; SDAG-NEXT: v_mov_b32_e32 v36, s24
+; SDAG-NEXT: v_mov_b32_e32 v37, s25
+; SDAG-NEXT: v_mov_b32_e32 v38, s26
+; SDAG-NEXT: v_mov_b32_e32 v39, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
-; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
+; SDAG-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
-; SDAG-NEXT: v_mov_b32_e32 v2, s20
-; SDAG-NEXT: v_mov_b32_e32 v3, s21
-; SDAG-NEXT: v_mov_b32_e32 v4, s22
-; SDAG-NEXT: v_mov_b32_e32 v5, s23
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT: s_nop 6
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s16
-; SDAG-NEXT: v_mov_b32_e32 v3, s17
-; SDAG-NEXT: v_mov_b32_e32 v4, s18
-; SDAG-NEXT: v_mov_b32_e32 v5, s19
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s16
+; SDAG-NEXT: v_mov_b32_e32 v17, s17
+; SDAG-NEXT: v_mov_b32_e32 v18, s18
+; SDAG-NEXT: v_mov_b32_e32 v19, s19
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s12
-; SDAG-NEXT: v_mov_b32_e32 v3, s13
-; SDAG-NEXT: v_mov_b32_e32 v4, s14
-; SDAG-NEXT: v_mov_b32_e32 v5, s15
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s12
+; SDAG-NEXT: v_mov_b32_e32 v17, s13
+; SDAG-NEXT: v_mov_b32_e32 v18, s14
+; SDAG-NEXT: v_mov_b32_e32 v19, s15
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: v_mov_b32_e32 v2, s8
-; SDAG-NEXT: v_mov_b32_e32 v3, s9
-; SDAG-NEXT: v_mov_b32_e32 v4, s10
-; SDAG-NEXT: v_mov_b32_e32 v5, s11
-; SDAG-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; SDAG-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: s_endpgm
;
@@ -4717,52 +4580,44 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; GISEL-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; GISEL-NEXT: v_mov_b32_e32 v24, 0
+; GISEL-NEXT: v_mov_b32_e32 v56, 0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
-; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
-; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:1 abid:2 blgp:3
-; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[22:23]
-; GISEL-NEXT: global_store_dwordx4 v24, v[8:11], s[0:1] sc0 sc1
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[42:43], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[8:9]
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:1 abid:2 blgp:3
+; GISEL-NEXT: v_mov_b64_e32 v[46:47], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[44:45], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: global_store_dwordx4 v56, v[40:43], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[12:15], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[44:47], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[16:19], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[48:51], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, v[20:23], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[52:55], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[16:19], s[0:1] sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[16:19], s[0:1] sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[20:23], s[0:1] offset:16 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[20:23], s[0:1] offset:16 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[24:27], s[0:1] offset:32 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[24:27], s[0:1] offset:32 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1
+; GISEL-NEXT: global_store_dwordx4 v56, v[28:31], s[0:1] offset:48 sc0 sc1
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_endpgm
;
@@ -4770,70 +4625,63 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mov_b32_e32 v40, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
+; HEURRC-NEXT: v_mov_b32_e32 v32, s20
+; HEURRC-NEXT: v_mov_b32_e32 v33, s21
+; HEURRC-NEXT: v_mov_b32_e32 v34, s22
+; HEURRC-NEXT: v_mov_b32_e32 v35, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v6, s24
-; HEURRC-NEXT: v_mov_b32_e32 v7, s25
-; HEURRC-NEXT: v_mov_b32_e32 v8, s26
-; HEURRC-NEXT: v_mov_b32_e32 v9, s27
+; HEURRC-NEXT: v_mov_b32_e32 v36, s24
+; HEURRC-NEXT: v_mov_b32_e32 v37, s25
+; HEURRC-NEXT: v_mov_b32_e32 v38, s26
+; HEURRC-NEXT: v_mov_b32_e32 v39, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8
+; HEURRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[2:5], v[6:9], a[16:31] cbsz:1 abid:2 blgp:3
-; HEURRC-NEXT: v_mov_b32_e32 v2, s20
-; HEURRC-NEXT: v_mov_b32_e32 v3, s21
-; HEURRC-NEXT: v_mov_b32_e32 v4, s22
-; HEURRC-NEXT: v_mov_b32_e32 v5, s23
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT: s_nop 6
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s16
-; HEURRC-NEXT: v_mov_b32_e32 v3, s17
-; HEURRC-NEXT: v_mov_b32_e32 v4, s18
-; HEURRC-NEXT: v_mov_b32_e32 v5, s19
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s16
+; HEURRC-NEXT: v_mov_b32_e32 v17, s17
+; HEURRC-NEXT: v_mov_b32_e32 v18, s18
+; HEURRC-NEXT: v_mov_b32_e32 v19, s19
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s12
-; HEURRC-NEXT: v_mov_b32_e32 v3, s13
-; HEURRC-NEXT: v_mov_b32_e32 v4, s14
-; HEURRC-NEXT: v_mov_b32_e32 v5, s15
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s12
+; HEURRC-NEXT: v_mov_b32_e32 v17, s13
+; HEURRC-NEXT: v_mov_b32_e32 v18, s14
+; HEURRC-NEXT: v_mov_b32_e32 v19, s15
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_nop 0
-; HEURRC-NEXT: v_mov_b32_e32 v2, s8
-; HEURRC-NEXT: v_mov_b32_e32 v3, s9
-; HEURRC-NEXT: v_mov_b32_e32 v4, s10
-; HEURRC-NEXT: v_mov_b32_e32 v5, s11
-; HEURRC-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] sc0 sc1
+; HEURRC-NEXT: v_mov_b32_e32 v16, s8
+; HEURRC-NEXT: v_mov_b32_e32 v17, s9
+; HEURRC-NEXT: v_mov_b32_e32 v18, s10
+; HEURRC-NEXT: v_mov_b32_e32 v19, s11
+; HEURRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 sc0 sc1
+; HEURRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; HEURRC-NEXT: s_waitcnt vmcnt(0)
; HEURRC-NEXT: s_endpgm
;
@@ -4841,17 +4689,17 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
-; VGPRRC-NEXT: v_mov_b32_e32 v32, 0
+; VGPRRC-NEXT: v_mov_b32_e32 v40, 0
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT: v_mov_b32_e32 v34, s20
-; VGPRRC-NEXT: v_mov_b32_e32 v35, s21
-; VGPRRC-NEXT: v_mov_b32_e32 v36, s22
-; VGPRRC-NEXT: v_mov_b32_e32 v37, s23
+; VGPRRC-NEXT: v_mov_b32_e32 v32, s20
+; VGPRRC-NEXT: v_mov_b32_e32 v33, s21
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22
+; VGPRRC-NEXT: v_mov_b32_e32 v35, s23
; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; VGPRRC-NEXT: v_mov_b32_e32 v38, s24
-; VGPRRC-NEXT: v_mov_b32_e32 v39, s25
-; VGPRRC-NEXT: v_mov_b32_e32 v40, s26
-; VGPRRC-NEXT: v_mov_b32_e32 v41, s27
+; VGPRRC-NEXT: v_mov_b32_e32 v36, s24
+; VGPRRC-NEXT: v_mov_b32_e32 v37, s25
+; VGPRRC-NEXT: v_mov_b32_e32 v38, s26
+; VGPRRC-NEXT: v_mov_b32_e32 v39, s27
; VGPRRC-NEXT: s_waitcnt lgkmcnt(0)
; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23]
; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21]
@@ -4862,42 +4710,42 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
; VGPRRC-NEXT: s_nop 1
-; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[34:37], v[38:41], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
; VGPRRC-NEXT: s_nop 6
; VGPRRC-NEXT: v_mov_b32_e32 v16, s20
; VGPRRC-NEXT: v_mov_b32_e32 v17, s21
; VGPRRC-NEXT: v_mov_b32_e32 v18, s22
; VGPRRC-NEXT: v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s16
; VGPRRC-NEXT: v_mov_b32_e32 v17, s17
; VGPRRC-NEXT: v_mov_b32_e32 v18, s18
; VGPRRC-NEXT: v_mov_b32_e32 v19, s19
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s12
; VGPRRC-NEXT: v_mov_b32_e32 v17, s13
; VGPRRC-NEXT: v_mov_b32_e32 v18, s14
; VGPRRC-NEXT: v_mov_b32_e32 v19, s15
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_nop 0
; VGPRRC-NEXT: v_mov_b32_e32 v16, s8
; VGPRRC-NEXT: v_mov_b32_e32 v17, s9
; VGPRRC-NEXT: v_mov_b32_e32 v18, s10
; VGPRRC-NEXT: v_mov_b32_e32 v19, s11
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
-; VGPRRC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 sc0 sc1
+; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1
; VGPRRC-NEXT: s_waitcnt vmcnt(0)
; VGPRRC-NEXT: s_endpgm
; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
@@ -5045,41 +4893,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5088,35 +4928,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5124,41 +4956,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15]
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
@@ -5279,41 +5103,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; SDAG-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v0, s20
-; SDAG-NEXT: v_mov_b32_e32 v1, s21
-; SDAG-NEXT: v_mov_b32_e32 v2, s22
-; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
; SDAG-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; SDAG-NEXT: v_mov_b32_e32 v4, s24
-; SDAG-NEXT: v_mov_b32_e32 v5, s25
-; SDAG-NEXT: v_mov_b32_e32 v6, s26
-; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
-; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
-; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
-; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
-; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
-; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
-; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
-; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
-; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
-; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
-; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
-; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
-; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
-; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
-; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; SDAG-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; SDAG-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 2
-; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; SDAG-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5322,35 +5138,27 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; GISEL-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[28:29]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
-; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[30:31]
-; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
-; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
-; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
-; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
-; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
-; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
-; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
-; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
-; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
-; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
-; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
-; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
-; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
-; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
-; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
+; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; GISEL-NEXT: v_mov_b32_e32 v16, 0
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 2
-; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
-; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
+; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GISEL-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5358,41 +5166,33 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24
; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_mov_b32_e32 v0, s20
-; HEURRC-NEXT: v_mov_b32_e32 v1, s21
-; HEURRC-NEXT: v_mov_b32_e32 v2, s22
-; HEURRC-NEXT: v_mov_b32_e32 v3, s23
+; HEURRC-NEXT: v_mov_b32_e32 v16, s20
+; HEURRC-NEXT: v_mov_b32_e32 v17, s21
+; HEURRC-NEXT: v_mov_b32_e32 v18, s22
+; HEURRC-NEXT: v_mov_b32_e32 v19, s23
; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64
-; HEURRC-NEXT: v_mov_b32_e32 v4, s24
-; HEURRC-NEXT: v_mov_b32_e32 v5, s25
-; HEURRC-NEXT: v_mov_b32_e32 v6, s26
-; HEURRC-NEXT: v_mov_b32_e32 v7, s27
+; HEURRC-NEXT: v_mov_b32_e32 v20, s24
+; HEURRC-NEXT: v_mov_b32_e32 v21, s25
+; HEURRC-NEXT: v_mov_b32_e32 v22, s26
+; HEURRC-NEXT: v_mov_b32_e32 v23, s27
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11
-; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12
-; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13
-; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14
-; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15
-; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16
-; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17
-; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18
-; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19
-; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20
-; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21
-; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22
-; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23
+; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
+; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
+; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17]
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19]
+; HEURRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21]
+; HEURRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1
-; HEURRC-NEXT: v_mov_b32_e32 v0, 0
+; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mov_b32_e32 v16, 0
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: s_nop 2
-; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
-; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
-; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
-; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; HEURRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; HEURRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; HEURRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; HEURRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
@@ -5643,20 +5443,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5664,20 +5462,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3]
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd:
@@ -5747,20 +5543,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT: v_mov_b32_e32 v8, 0
+; GCN-NEXT: v_mov_b32_e32 v12, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; GCN-NEXT: v_accvgpr_write_b32 a0, s0
+; GCN-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; GCN-NEXT: v_accvgpr_write_b32 a1, s1
-; GCN-NEXT: v_accvgpr_write_b32 a2, s2
-; GCN-NEXT: v_accvgpr_write_b32 a3, s3
+; GCN-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; GCN-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; GCN-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
;
; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5768,20 +5562,18 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
-; HEURRC-NEXT: v_mov_b32_e32 v8, 0
+; HEURRC-NEXT: v_mov_b32_e32 v12, 0
; HEURRC-NEXT: s_waitcnt lgkmcnt(0)
; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11]
; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13]
-; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0
+; HEURRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3]
; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15]
-; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1
-; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2
-; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3
+; HEURRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
; HEURRC-NEXT: s_nop 1
-; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1
; HEURRC-NEXT: s_nop 7
-; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7]
+; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7]
; HEURRC-NEXT: s_endpgm
;
; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags:
@@ -5845,5 +5637,5 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt
ret void
}
-attributes #0 = { "amdgpu-flat-work-group-size"="512,512" }
+attributes #0 = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-agpr-alloc"="0,0" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,64" }