aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mfma-loop.ll')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mfma-loop.ll 2828
1 files changed, 2017 insertions, 811 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4bb6538..8b6bb9b 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
; Check that we do not copy agprs to vgprs and back inside the loop.
@@ -101,53 +101,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -160,53 +224,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v13, 0
+; GFX942-NEXT: v_mov_b32_e32 v14, 0
+; GFX942-NEXT: v_mov_b32_e32 v15, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
+; GFX942-NEXT: v_mov_b32_e32 v19, 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
+; GFX942-NEXT: v_mov_b32_e32 v21, 0
+; GFX942-NEXT: v_mov_b32_e32 v22, 0
+; GFX942-NEXT: v_mov_b32_e32 v23, 0
+; GFX942-NEXT: v_mov_b32_e32 v24, 0
+; GFX942-NEXT: v_mov_b32_e32 v25, 0
+; GFX942-NEXT: v_mov_b32_e32 v26, 0
+; GFX942-NEXT: v_mov_b32_e32 v27, 0
+; GFX942-NEXT: v_mov_b32_e32 v28, 0
+; GFX942-NEXT: v_mov_b32_e32 v29, 0
+; GFX942-NEXT: v_mov_b32_e32 v30, 0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB0_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -332,54 +460,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0x42f60000
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v31
+; GFX90A-NEXT: v_mov_b32_e32 v1, v31
+; GFX90A-NEXT: v_mov_b32_e32 v2, v31
+; GFX90A-NEXT: v_mov_b32_e32 v3, v31
+; GFX90A-NEXT: v_mov_b32_e32 v4, v31
+; GFX90A-NEXT: v_mov_b32_e32 v5, v31
+; GFX90A-NEXT: v_mov_b32_e32 v6, v31
+; GFX90A-NEXT: v_mov_b32_e32 v7, v31
+; GFX90A-NEXT: v_mov_b32_e32 v8, v31
+; GFX90A-NEXT: v_mov_b32_e32 v9, v31
+; GFX90A-NEXT: v_mov_b32_e32 v10, v31
+; GFX90A-NEXT: v_mov_b32_e32 v11, v31
+; GFX90A-NEXT: v_mov_b32_e32 v12, v31
+; GFX90A-NEXT: v_mov_b32_e32 v13, v31
+; GFX90A-NEXT: v_mov_b32_e32 v14, v31
+; GFX90A-NEXT: v_mov_b32_e32 v15, v31
+; GFX90A-NEXT: v_mov_b32_e32 v16, v31
+; GFX90A-NEXT: v_mov_b32_e32 v17, v31
+; GFX90A-NEXT: v_mov_b32_e32 v18, v31
+; GFX90A-NEXT: v_mov_b32_e32 v19, v31
+; GFX90A-NEXT: v_mov_b32_e32 v20, v31
+; GFX90A-NEXT: v_mov_b32_e32 v21, v31
+; GFX90A-NEXT: v_mov_b32_e32 v22, v31
+; GFX90A-NEXT: v_mov_b32_e32 v23, v31
+; GFX90A-NEXT: v_mov_b32_e32 v24, v31
+; GFX90A-NEXT: v_mov_b32_e32 v25, v31
+; GFX90A-NEXT: v_mov_b32_e32 v26, v31
+; GFX90A-NEXT: v_mov_b32_e32 v27, v31
+; GFX90A-NEXT: v_mov_b32_e32 v28, v31
+; GFX90A-NEXT: v_mov_b32_e32 v29, v31
+; GFX90A-NEXT: v_mov_b32_e32 v30, v31
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -392,54 +583,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg
;
; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0x42f60000
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, v31
+; GFX942-NEXT: v_mov_b32_e32 v1, v31
+; GFX942-NEXT: v_mov_b32_e32 v2, v31
+; GFX942-NEXT: v_mov_b32_e32 v3, v31
+; GFX942-NEXT: v_mov_b32_e32 v4, v31
+; GFX942-NEXT: v_mov_b32_e32 v5, v31
+; GFX942-NEXT: v_mov_b32_e32 v6, v31
+; GFX942-NEXT: v_mov_b32_e32 v7, v31
+; GFX942-NEXT: v_mov_b32_e32 v8, v31
+; GFX942-NEXT: v_mov_b32_e32 v9, v31
+; GFX942-NEXT: v_mov_b32_e32 v10, v31
+; GFX942-NEXT: v_mov_b32_e32 v11, v31
+; GFX942-NEXT: v_mov_b32_e32 v12, v31
+; GFX942-NEXT: v_mov_b32_e32 v13, v31
+; GFX942-NEXT: v_mov_b32_e32 v14, v31
+; GFX942-NEXT: v_mov_b32_e32 v15, v31
+; GFX942-NEXT: v_mov_b32_e32 v16, v31
+; GFX942-NEXT: v_mov_b32_e32 v17, v31
+; GFX942-NEXT: v_mov_b32_e32 v18, v31
+; GFX942-NEXT: v_mov_b32_e32 v19, v31
+; GFX942-NEXT: v_mov_b32_e32 v20, v31
+; GFX942-NEXT: v_mov_b32_e32 v21, v31
+; GFX942-NEXT: v_mov_b32_e32 v22, v31
+; GFX942-NEXT: v_mov_b32_e32 v23, v31
+; GFX942-NEXT: v_mov_b32_e32 v24, v31
+; GFX942-NEXT: v_mov_b32_e32 v25, v31
+; GFX942-NEXT: v_mov_b32_e32 v26, v31
+; GFX942-NEXT: v_mov_b32_e32 v27, v31
+; GFX942-NEXT: v_mov_b32_e32 v28, v31
+; GFX942-NEXT: v_mov_b32_e32 v29, v31
+; GFX942-NEXT: v_mov_b32_e32 v30, v31
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB1_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -559,53 +813,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_non_splat:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -618,53 +936,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_non_splat:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v13, 0
+; GFX942-NEXT: v_mov_b32_e32 v14, 0
+; GFX942-NEXT: v_mov_b32_e32 v15, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
+; GFX942-NEXT: v_mov_b32_e32 v19, 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
+; GFX942-NEXT: v_mov_b32_e32 v21, 0
+; GFX942-NEXT: v_mov_b32_e32 v22, 0
+; GFX942-NEXT: v_mov_b32_e32 v23, 0
+; GFX942-NEXT: v_mov_b32_e32 v24, 0
+; GFX942-NEXT: v_mov_b32_e32 v25, 0
+; GFX942-NEXT: v_mov_b32_e32 v26, 0
+; GFX942-NEXT: v_mov_b32_e32 v27, 0
+; GFX942-NEXT: v_mov_b32_e32 v28, 0
+; GFX942-NEXT: v_mov_b32_e32 v29, 0
+; GFX942-NEXT: v_mov_b32_e32 v30, 0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -821,85 +1203,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
;
; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -912,85 +1326,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg)
;
; GFX942-LABEL: test_mfma_loop_unfoldable_seq:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000
+; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000
+; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000
+; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000
+; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000
+; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000
+; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000
+; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000
+; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000
+; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000
+; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000
+; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000
+; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000
+; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000
+; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000
+; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000
+; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000
+; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000
+; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000
+; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000
+; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000
+; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000
+; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000
+; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000
+; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000
+; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000
+; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000
+; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000
+; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1110,54 +1556,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
;
; GFX90A-LABEL: test_mfma_loop_vgpr_init:
; GFX90A: ; %bb.0: ; %entry
-; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_and_b32_e32 v31, 0x3ff, v0
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, v31
+; GFX90A-NEXT: v_mov_b32_e32 v1, v31
+; GFX90A-NEXT: v_mov_b32_e32 v2, v31
+; GFX90A-NEXT: v_mov_b32_e32 v3, v31
+; GFX90A-NEXT: v_mov_b32_e32 v4, v31
+; GFX90A-NEXT: v_mov_b32_e32 v5, v31
+; GFX90A-NEXT: v_mov_b32_e32 v6, v31
+; GFX90A-NEXT: v_mov_b32_e32 v7, v31
+; GFX90A-NEXT: v_mov_b32_e32 v8, v31
+; GFX90A-NEXT: v_mov_b32_e32 v9, v31
+; GFX90A-NEXT: v_mov_b32_e32 v10, v31
+; GFX90A-NEXT: v_mov_b32_e32 v11, v31
+; GFX90A-NEXT: v_mov_b32_e32 v12, v31
+; GFX90A-NEXT: v_mov_b32_e32 v13, v31
+; GFX90A-NEXT: v_mov_b32_e32 v14, v31
+; GFX90A-NEXT: v_mov_b32_e32 v15, v31
+; GFX90A-NEXT: v_mov_b32_e32 v16, v31
+; GFX90A-NEXT: v_mov_b32_e32 v17, v31
+; GFX90A-NEXT: v_mov_b32_e32 v18, v31
+; GFX90A-NEXT: v_mov_b32_e32 v19, v31
+; GFX90A-NEXT: v_mov_b32_e32 v20, v31
+; GFX90A-NEXT: v_mov_b32_e32 v21, v31
+; GFX90A-NEXT: v_mov_b32_e32 v22, v31
+; GFX90A-NEXT: v_mov_b32_e32 v23, v31
+; GFX90A-NEXT: v_mov_b32_e32 v24, v31
+; GFX90A-NEXT: v_mov_b32_e32 v25, v31
+; GFX90A-NEXT: v_mov_b32_e32 v26, v31
+; GFX90A-NEXT: v_mov_b32_e32 v27, v31
+; GFX90A-NEXT: v_mov_b32_e32 v28, v31
+; GFX90A-NEXT: v_mov_b32_e32 v29, v31
+; GFX90A-NEXT: v_mov_b32_e32 v30, v31
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1170,54 +1679,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
;
; GFX942-LABEL: test_mfma_loop_vgpr_init:
; GFX942: ; %bb.0: ; %entry
-; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_and_b32_e32 v31, 0x3ff, v0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, v31
+; GFX942-NEXT: v_mov_b32_e32 v1, v31
+; GFX942-NEXT: v_mov_b32_e32 v2, v31
+; GFX942-NEXT: v_mov_b32_e32 v3, v31
+; GFX942-NEXT: v_mov_b32_e32 v4, v31
+; GFX942-NEXT: v_mov_b32_e32 v5, v31
+; GFX942-NEXT: v_mov_b32_e32 v6, v31
+; GFX942-NEXT: v_mov_b32_e32 v7, v31
+; GFX942-NEXT: v_mov_b32_e32 v8, v31
+; GFX942-NEXT: v_mov_b32_e32 v9, v31
+; GFX942-NEXT: v_mov_b32_e32 v10, v31
+; GFX942-NEXT: v_mov_b32_e32 v11, v31
+; GFX942-NEXT: v_mov_b32_e32 v12, v31
+; GFX942-NEXT: v_mov_b32_e32 v13, v31
+; GFX942-NEXT: v_mov_b32_e32 v14, v31
+; GFX942-NEXT: v_mov_b32_e32 v15, v31
+; GFX942-NEXT: v_mov_b32_e32 v16, v31
+; GFX942-NEXT: v_mov_b32_e32 v17, v31
+; GFX942-NEXT: v_mov_b32_e32 v18, v31
+; GFX942-NEXT: v_mov_b32_e32 v19, v31
+; GFX942-NEXT: v_mov_b32_e32 v20, v31
+; GFX942-NEXT: v_mov_b32_e32 v21, v31
+; GFX942-NEXT: v_mov_b32_e32 v22, v31
+; GFX942-NEXT: v_mov_b32_e32 v23, v31
+; GFX942-NEXT: v_mov_b32_e32 v24, v31
+; GFX942-NEXT: v_mov_b32_e32 v25, v31
+; GFX942-NEXT: v_mov_b32_e32 v26, v31
+; GFX942-NEXT: v_mov_b32_e32 v27, v31
+; GFX942-NEXT: v_mov_b32_e32 v28, v31
+; GFX942-NEXT: v_mov_b32_e32 v29, v31
+; GFX942-NEXT: v_mov_b32_e32 v30, v31
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB4_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1378,54 +1950,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v31, s1
; GFX90A-NEXT: v_mov_b32_e32 v0, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v2, s1
+; GFX90A-NEXT: v_mov_b32_e32 v3, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, s1
+; GFX90A-NEXT: v_mov_b32_e32 v5, s1
+; GFX90A-NEXT: v_mov_b32_e32 v6, s1
+; GFX90A-NEXT: v_mov_b32_e32 v7, s1
+; GFX90A-NEXT: v_mov_b32_e32 v8, s1
+; GFX90A-NEXT: v_mov_b32_e32 v9, s1
+; GFX90A-NEXT: v_mov_b32_e32 v10, s1
+; GFX90A-NEXT: v_mov_b32_e32 v11, s1
+; GFX90A-NEXT: v_mov_b32_e32 v12, s1
+; GFX90A-NEXT: v_mov_b32_e32 v13, s1
+; GFX90A-NEXT: v_mov_b32_e32 v14, s1
+; GFX90A-NEXT: v_mov_b32_e32 v15, s1
+; GFX90A-NEXT: v_mov_b32_e32 v16, s1
+; GFX90A-NEXT: v_mov_b32_e32 v17, s1
+; GFX90A-NEXT: v_mov_b32_e32 v18, s1
+; GFX90A-NEXT: v_mov_b32_e32 v19, s1
+; GFX90A-NEXT: v_mov_b32_e32 v20, s1
+; GFX90A-NEXT: v_mov_b32_e32 v21, s1
+; GFX90A-NEXT: v_mov_b32_e32 v22, s1
+; GFX90A-NEXT: v_mov_b32_e32 v23, s1
+; GFX90A-NEXT: v_mov_b32_e32 v24, s1
+; GFX90A-NEXT: v_mov_b32_e32 v25, s1
+; GFX90A-NEXT: v_mov_b32_e32 v26, s1
+; GFX90A-NEXT: v_mov_b32_e32 v27, s1
+; GFX90A-NEXT: v_mov_b32_e32 v28, s1
+; GFX90A-NEXT: v_mov_b32_e32 v29, s1
+; GFX90A-NEXT: v_mov_b32_e32 v30, s1
; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1440,54 +2075,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v31, s1
; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v2, s1
+; GFX942-NEXT: v_mov_b32_e32 v3, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, s1
+; GFX942-NEXT: v_mov_b32_e32 v5, s1
+; GFX942-NEXT: v_mov_b32_e32 v6, s1
+; GFX942-NEXT: v_mov_b32_e32 v7, s1
+; GFX942-NEXT: v_mov_b32_e32 v8, s1
+; GFX942-NEXT: v_mov_b32_e32 v9, s1
+; GFX942-NEXT: v_mov_b32_e32 v10, s1
+; GFX942-NEXT: v_mov_b32_e32 v11, s1
+; GFX942-NEXT: v_mov_b32_e32 v12, s1
+; GFX942-NEXT: v_mov_b32_e32 v13, s1
+; GFX942-NEXT: v_mov_b32_e32 v14, s1
+; GFX942-NEXT: v_mov_b32_e32 v15, s1
+; GFX942-NEXT: v_mov_b32_e32 v16, s1
+; GFX942-NEXT: v_mov_b32_e32 v17, s1
+; GFX942-NEXT: v_mov_b32_e32 v18, s1
+; GFX942-NEXT: v_mov_b32_e32 v19, s1
+; GFX942-NEXT: v_mov_b32_e32 v20, s1
+; GFX942-NEXT: v_mov_b32_e32 v21, s1
+; GFX942-NEXT: v_mov_b32_e32 v22, s1
+; GFX942-NEXT: v_mov_b32_e32 v23, s1
+; GFX942-NEXT: v_mov_b32_e32 v24, s1
+; GFX942-NEXT: v_mov_b32_e32 v25, s1
+; GFX942-NEXT: v_mov_b32_e32 v26, s1
+; GFX942-NEXT: v_mov_b32_e32 v27, s1
+; GFX942-NEXT: v_mov_b32_e32 v28, s1
+; GFX942-NEXT: v_mov_b32_e32 v29, s1
+; GFX942-NEXT: v_mov_b32_e32 v30, s1
; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1644,56 +2342,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX90A-LABEL: test_mfma_loop_mixed_init:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s1
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: s_mov_b32 s0, 16
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -1707,56 +2467,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa
; GFX942-LABEL: test_mfma_loop_mixed_init:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: v_mov_b32_e32 v0, s1
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, s1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v13, 0
+; GFX942-NEXT: v_mov_b32_e32 v14, 0
+; GFX942-NEXT: v_mov_b32_e32 v15, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
+; GFX942-NEXT: v_mov_b32_e32 v19, 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
+; GFX942-NEXT: v_mov_b32_e32 v21, 0
+; GFX942-NEXT: v_mov_b32_e32 v22, 0
+; GFX942-NEXT: v_mov_b32_e32 v23, 0
+; GFX942-NEXT: v_mov_b32_e32 v24, 0
+; GFX942-NEXT: v_mov_b32_e32 v25, 0
+; GFX942-NEXT: v_mov_b32_e32 v26, 0
+; GFX942-NEXT: v_mov_b32_e32 v27, 0
+; GFX942-NEXT: v_mov_b32_e32 v28, 0
+; GFX942-NEXT: v_mov_b32_e32 v29, 0
+; GFX942-NEXT: v_mov_b32_e32 v30, 0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2094,49 +2916,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: s_nop 2
-; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0
; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v33
; GFX90A-NEXT: s_add_i32 s0, s0, -1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v32, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v33, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1
; GFX90A-NEXT: ; %bb.2: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_nop 12
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2156,49 +3043,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
; GFX942-NEXT: s_nop 15
; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
-; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v32, a0
; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v33
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v32, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v33, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
; GFX942-NEXT: ; %bb.2: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-NEXT: s_nop 11
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
@@ -2401,7 +3353,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2471,7 +3422,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg)
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
@@ -2615,50 +3565,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s4, s4, -1
; GFX90A-NEXT: s_cmp_lg_u32 s4, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1
; GFX90A-NEXT: ; %bb.2: ; %exit
-; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
@@ -2696,50 +3710,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 {
; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v13, 0
+; GFX942-NEXT: v_mov_b32_e32 v14, 0
+; GFX942-NEXT: v_mov_b32_e32 v15, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
+; GFX942-NEXT: v_mov_b32_e32 v19, 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
+; GFX942-NEXT: v_mov_b32_e32 v21, 0
+; GFX942-NEXT: v_mov_b32_e32 v22, 0
+; GFX942-NEXT: v_mov_b32_e32 v23, 0
+; GFX942-NEXT: v_mov_b32_e32 v24, 0
+; GFX942-NEXT: v_mov_b32_e32 v25, 0
+; GFX942-NEXT: v_mov_b32_e32 v26, 0
+; GFX942-NEXT: v_mov_b32_e32 v27, 0
+; GFX942-NEXT: v_mov_b32_e32 v28, 0
+; GFX942-NEXT: v_mov_b32_e32 v29, 0
+; GFX942-NEXT: v_mov_b32_e32 v30, 0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB10_1
; GFX942-NEXT: ; %bb.2: ; %exit
-; GFX942-NEXT: s_nop 14
; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
@@ -2873,50 +3951,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s4, 16
-; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mov_b32_e32 v5, 0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
+; GFX90A-NEXT: v_mov_b32_e32 v7, 0
+; GFX90A-NEXT: v_mov_b32_e32 v8, 0
+; GFX90A-NEXT: v_mov_b32_e32 v9, 0
+; GFX90A-NEXT: v_mov_b32_e32 v10, 0
+; GFX90A-NEXT: v_mov_b32_e32 v11, 0
+; GFX90A-NEXT: v_mov_b32_e32 v12, 0
+; GFX90A-NEXT: v_mov_b32_e32 v13, 0
+; GFX90A-NEXT: v_mov_b32_e32 v14, 0
+; GFX90A-NEXT: v_mov_b32_e32 v15, 0
+; GFX90A-NEXT: v_mov_b32_e32 v16, 0
+; GFX90A-NEXT: v_mov_b32_e32 v17, 0
+; GFX90A-NEXT: v_mov_b32_e32 v18, 0
+; GFX90A-NEXT: v_mov_b32_e32 v19, 0
+; GFX90A-NEXT: v_mov_b32_e32 v20, 0
+; GFX90A-NEXT: v_mov_b32_e32 v21, 0
+; GFX90A-NEXT: v_mov_b32_e32 v22, 0
+; GFX90A-NEXT: v_mov_b32_e32 v23, 0
+; GFX90A-NEXT: v_mov_b32_e32 v24, 0
+; GFX90A-NEXT: v_mov_b32_e32 v25, 0
+; GFX90A-NEXT: v_mov_b32_e32 v26, 0
+; GFX90A-NEXT: v_mov_b32_e32 v27, 0
+; GFX90A-NEXT: v_mov_b32_e32 v28, 0
+; GFX90A-NEXT: v_mov_b32_e32 v29, 0
+; GFX90A-NEXT: v_mov_b32_e32 v30, 0
+; GFX90A-NEXT: v_mov_b32_e32 v31, 0
+; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0
; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_nop 1
-; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: s_add_i32 s4, s4, -1
; GFX90A-NEXT: s_cmp_lg_u32 s4, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31]
+; GFX90A-NEXT: s_nop 15
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31
; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1
; GFX90A-NEXT: ; %bb.2: ; %exit
-; GFX90A-NEXT: s_nop 15
; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
@@ -2954,50 +4096,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 {
; GFX942-LABEL: test_mfma_loop_non_splat_ret_use:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
-; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
-; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 16
-; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-NEXT: v_mov_b32_e32 v6, 0
+; GFX942-NEXT: v_mov_b32_e32 v7, 0
+; GFX942-NEXT: v_mov_b32_e32 v8, 0
+; GFX942-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-NEXT: v_mov_b32_e32 v10, 0
+; GFX942-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-NEXT: v_mov_b32_e32 v12, 0
+; GFX942-NEXT: v_mov_b32_e32 v13, 0
+; GFX942-NEXT: v_mov_b32_e32 v14, 0
+; GFX942-NEXT: v_mov_b32_e32 v15, 0
+; GFX942-NEXT: v_mov_b32_e32 v16, 0
+; GFX942-NEXT: v_mov_b32_e32 v17, 0
+; GFX942-NEXT: v_mov_b32_e32 v18, 0
+; GFX942-NEXT: v_mov_b32_e32 v19, 0
+; GFX942-NEXT: v_mov_b32_e32 v20, 0
+; GFX942-NEXT: v_mov_b32_e32 v21, 0
+; GFX942-NEXT: v_mov_b32_e32 v22, 0
+; GFX942-NEXT: v_mov_b32_e32 v23, 0
+; GFX942-NEXT: v_mov_b32_e32 v24, 0
+; GFX942-NEXT: v_mov_b32_e32 v25, 0
+; GFX942-NEXT: v_mov_b32_e32 v26, 0
+; GFX942-NEXT: v_mov_b32_e32 v27, 0
+; GFX942-NEXT: v_mov_b32_e32 v28, 0
+; GFX942-NEXT: v_mov_b32_e32 v29, 0
+; GFX942-NEXT: v_mov_b32_e32 v30, 0
+; GFX942-NEXT: v_mov_b32_e32 v31, 0
+; GFX942-NEXT: v_mov_b32_e32 v32, 1.0
; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_nop 1
-; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: s_add_i32 s0, s0, -1
; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31]
+; GFX942-NEXT: s_nop 15
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX942-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX942-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX942-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX942-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX942-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX942-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX942-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX942-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX942-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX942-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX942-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX942-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX942-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX942-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX942-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX942-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX942-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX942-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX942-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX942-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX942-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX942-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX942-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX942-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX942-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX942-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX942-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX942-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX942-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX942-NEXT: v_accvgpr_read_b32 v31, a31
; GFX942-NEXT: s_cbranch_scc1 .LBB11_1
; GFX942-NEXT: ; %bb.2: ; %exit
-; GFX942-NEXT: s_nop 14
; GFX942-NEXT: v_accvgpr_read_b32 v0, a0
; GFX942-NEXT: v_accvgpr_read_b32 v1, a1
; GFX942-NEXT: v_accvgpr_read_b32 v2, a2