diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mfma-loop.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 2828 |
1 files changed, 2017 insertions, 811 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll index 4bb6538..8b6bb9b 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s ; Check that we do not copy agprs to vgprs and back inside the loop. @@ -101,53 +101,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -160,53 +224,117 @@ define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -332,54 +460,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v31 +; GFX90A-NEXT: v_mov_b32_e32 v1, v31 +; GFX90A-NEXT: v_mov_b32_e32 v2, v31 +; GFX90A-NEXT: v_mov_b32_e32 v3, v31 +; GFX90A-NEXT: v_mov_b32_e32 v4, v31 +; GFX90A-NEXT: v_mov_b32_e32 v5, v31 +; GFX90A-NEXT: v_mov_b32_e32 v6, v31 +; GFX90A-NEXT: v_mov_b32_e32 v7, v31 +; GFX90A-NEXT: v_mov_b32_e32 v8, v31 +; GFX90A-NEXT: v_mov_b32_e32 v9, v31 +; GFX90A-NEXT: v_mov_b32_e32 v10, v31 +; GFX90A-NEXT: v_mov_b32_e32 v11, v31 +; GFX90A-NEXT: v_mov_b32_e32 v12, v31 +; GFX90A-NEXT: v_mov_b32_e32 v13, v31 +; GFX90A-NEXT: v_mov_b32_e32 v14, v31 +; GFX90A-NEXT: v_mov_b32_e32 v15, v31 +; GFX90A-NEXT: v_mov_b32_e32 v16, v31 +; GFX90A-NEXT: v_mov_b32_e32 v17, v31 +; GFX90A-NEXT: v_mov_b32_e32 v18, v31 +; GFX90A-NEXT: v_mov_b32_e32 v19, v31 +; GFX90A-NEXT: v_mov_b32_e32 v20, v31 +; GFX90A-NEXT: v_mov_b32_e32 v21, v31 +; GFX90A-NEXT: v_mov_b32_e32 v22, v31 +; GFX90A-NEXT: v_mov_b32_e32 v23, v31 +; GFX90A-NEXT: v_mov_b32_e32 v24, v31 +; GFX90A-NEXT: v_mov_b32_e32 v25, v31 +; GFX90A-NEXT: v_mov_b32_e32 v26, v31 +; GFX90A-NEXT: v_mov_b32_e32 v27, v31 +; GFX90A-NEXT: v_mov_b32_e32 v28, v31 +; GFX90A-NEXT: v_mov_b32_e32 v29, v31 +; GFX90A-NEXT: v_mov_b32_e32 v30, v31 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -392,54 +583,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg ; ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x42f60000 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, v31 +; GFX942-NEXT: v_mov_b32_e32 v1, v31 +; GFX942-NEXT: v_mov_b32_e32 v2, v31 +; GFX942-NEXT: v_mov_b32_e32 v3, v31 +; GFX942-NEXT: v_mov_b32_e32 v4, v31 +; GFX942-NEXT: v_mov_b32_e32 v5, v31 +; GFX942-NEXT: v_mov_b32_e32 v6, v31 +; GFX942-NEXT: v_mov_b32_e32 v7, v31 +; GFX942-NEXT: v_mov_b32_e32 v8, v31 +; GFX942-NEXT: v_mov_b32_e32 v9, v31 +; GFX942-NEXT: v_mov_b32_e32 v10, v31 +; GFX942-NEXT: v_mov_b32_e32 v11, v31 +; GFX942-NEXT: v_mov_b32_e32 v12, v31 +; GFX942-NEXT: v_mov_b32_e32 v13, v31 +; GFX942-NEXT: v_mov_b32_e32 v14, v31 +; GFX942-NEXT: v_mov_b32_e32 v15, v31 +; GFX942-NEXT: v_mov_b32_e32 v16, v31 +; GFX942-NEXT: v_mov_b32_e32 v17, v31 +; GFX942-NEXT: v_mov_b32_e32 v18, v31 +; GFX942-NEXT: v_mov_b32_e32 v19, v31 +; GFX942-NEXT: v_mov_b32_e32 v20, v31 +; GFX942-NEXT: v_mov_b32_e32 v21, v31 +; GFX942-NEXT: v_mov_b32_e32 v22, v31 +; GFX942-NEXT: v_mov_b32_e32 v23, v31 +; GFX942-NEXT: v_mov_b32_e32 v24, v31 +; GFX942-NEXT: v_mov_b32_e32 v25, v31 +; GFX942-NEXT: v_mov_b32_e32 v26, v31 +; GFX942-NEXT: v_mov_b32_e32 v27, v31 +; GFX942-NEXT: v_mov_b32_e32 v28, v31 +; GFX942-NEXT: v_mov_b32_e32 v29, v31 +; GFX942-NEXT: v_mov_b32_e32 v30, v31 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -559,53 +813,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -618,53 +936,117 @@ define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -821,85 +1203,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -912,85 +1326,117 @@ define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f80000 +; GFX942-NEXT: v_mov_b32_e32 v2, 0x42fa0000 +; GFX942-NEXT: v_mov_b32_e32 v3, 0x42fc0000 +; GFX942-NEXT: v_mov_b32_e32 v4, 0x42fe0000 +; GFX942-NEXT: v_mov_b32_e32 v5, 0x43000000 +; GFX942-NEXT: v_mov_b32_e32 v6, 0x43010000 +; GFX942-NEXT: v_mov_b32_e32 v7, 0x43020000 +; GFX942-NEXT: v_mov_b32_e32 v8, 0x43030000 +; GFX942-NEXT: v_mov_b32_e32 v9, 0x43040000 +; GFX942-NEXT: v_mov_b32_e32 v10, 0x43050000 +; GFX942-NEXT: v_mov_b32_e32 v11, 0x43060000 +; GFX942-NEXT: v_mov_b32_e32 v12, 0x43070000 +; GFX942-NEXT: v_mov_b32_e32 v13, 0x43080000 +; GFX942-NEXT: v_mov_b32_e32 v14, 0x43090000 +; GFX942-NEXT: v_mov_b32_e32 v15, 0x430a0000 +; GFX942-NEXT: v_mov_b32_e32 v16, 0x430b0000 +; GFX942-NEXT: v_mov_b32_e32 v17, 0x430c0000 +; GFX942-NEXT: v_mov_b32_e32 v18, 0x430d0000 +; GFX942-NEXT: v_mov_b32_e32 v19, 0x430e0000 +; GFX942-NEXT: v_mov_b32_e32 v20, 0x430f0000 +; GFX942-NEXT: v_mov_b32_e32 v21, 0x43100000 +; GFX942-NEXT: v_mov_b32_e32 v22, 0x43110000 +; GFX942-NEXT: v_mov_b32_e32 v23, 0x43120000 +; GFX942-NEXT: v_mov_b32_e32 v24, 0x43130000 +; GFX942-NEXT: v_mov_b32_e32 v25, 0x43140000 +; GFX942-NEXT: v_mov_b32_e32 v26, 0x43150000 +; GFX942-NEXT: v_mov_b32_e32 v27, 0x43160000 +; GFX942-NEXT: v_mov_b32_e32 v28, 0x43170000 +; GFX942-NEXT: v_mov_b32_e32 v29, 0x43180000 +; GFX942-NEXT: v_mov_b32_e32 v30, 0x43190000 +; GFX942-NEXT: v_mov_b32_e32 v31, 0x431a0000 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1110,54 +1556,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_and_b32_e32 v31, 0x3ff, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, v31 +; GFX90A-NEXT: v_mov_b32_e32 v1, v31 +; GFX90A-NEXT: v_mov_b32_e32 v2, v31 +; GFX90A-NEXT: v_mov_b32_e32 v3, v31 +; GFX90A-NEXT: v_mov_b32_e32 v4, v31 +; GFX90A-NEXT: v_mov_b32_e32 v5, v31 +; GFX90A-NEXT: v_mov_b32_e32 v6, v31 +; GFX90A-NEXT: v_mov_b32_e32 v7, v31 +; GFX90A-NEXT: v_mov_b32_e32 v8, v31 +; GFX90A-NEXT: v_mov_b32_e32 v9, v31 +; GFX90A-NEXT: v_mov_b32_e32 v10, v31 +; GFX90A-NEXT: v_mov_b32_e32 v11, v31 +; GFX90A-NEXT: v_mov_b32_e32 v12, v31 +; GFX90A-NEXT: v_mov_b32_e32 v13, v31 +; GFX90A-NEXT: v_mov_b32_e32 v14, v31 +; GFX90A-NEXT: v_mov_b32_e32 v15, v31 +; GFX90A-NEXT: v_mov_b32_e32 v16, v31 +; GFX90A-NEXT: v_mov_b32_e32 v17, v31 +; GFX90A-NEXT: v_mov_b32_e32 v18, v31 +; GFX90A-NEXT: v_mov_b32_e32 v19, v31 +; GFX90A-NEXT: v_mov_b32_e32 v20, v31 +; GFX90A-NEXT: v_mov_b32_e32 v21, v31 +; GFX90A-NEXT: v_mov_b32_e32 v22, v31 +; GFX90A-NEXT: v_mov_b32_e32 v23, v31 +; GFX90A-NEXT: v_mov_b32_e32 v24, v31 +; GFX90A-NEXT: v_mov_b32_e32 v25, v31 +; GFX90A-NEXT: v_mov_b32_e32 v26, v31 +; GFX90A-NEXT: v_mov_b32_e32 v27, v31 +; GFX90A-NEXT: v_mov_b32_e32 v28, v31 +; GFX90A-NEXT: v_mov_b32_e32 v29, v31 +; GFX90A-NEXT: v_mov_b32_e32 v30, v31 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1170,54 +1679,117 @@ define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 { ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_and_b32_e32 v31, 0x3ff, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, v31 +; GFX942-NEXT: v_mov_b32_e32 v1, v31 +; GFX942-NEXT: v_mov_b32_e32 v2, v31 +; GFX942-NEXT: v_mov_b32_e32 v3, v31 +; GFX942-NEXT: v_mov_b32_e32 v4, v31 +; GFX942-NEXT: v_mov_b32_e32 v5, v31 +; GFX942-NEXT: v_mov_b32_e32 v6, v31 +; GFX942-NEXT: v_mov_b32_e32 v7, v31 +; GFX942-NEXT: v_mov_b32_e32 v8, v31 +; GFX942-NEXT: v_mov_b32_e32 v9, v31 +; GFX942-NEXT: v_mov_b32_e32 v10, v31 +; GFX942-NEXT: v_mov_b32_e32 v11, v31 +; GFX942-NEXT: v_mov_b32_e32 v12, v31 +; GFX942-NEXT: v_mov_b32_e32 v13, v31 +; GFX942-NEXT: v_mov_b32_e32 v14, v31 +; GFX942-NEXT: v_mov_b32_e32 v15, v31 +; GFX942-NEXT: v_mov_b32_e32 v16, v31 +; GFX942-NEXT: v_mov_b32_e32 v17, v31 +; GFX942-NEXT: v_mov_b32_e32 v18, v31 +; GFX942-NEXT: v_mov_b32_e32 v19, v31 +; GFX942-NEXT: v_mov_b32_e32 v20, v31 +; GFX942-NEXT: v_mov_b32_e32 v21, v31 +; GFX942-NEXT: v_mov_b32_e32 v22, v31 +; GFX942-NEXT: v_mov_b32_e32 v23, v31 +; GFX942-NEXT: v_mov_b32_e32 v24, v31 +; GFX942-NEXT: v_mov_b32_e32 v25, v31 +; GFX942-NEXT: v_mov_b32_e32 v26, v31 +; GFX942-NEXT: v_mov_b32_e32 v27, v31 +; GFX942-NEXT: v_mov_b32_e32 v28, v31 +; GFX942-NEXT: v_mov_b32_e32 v29, v31 +; GFX942-NEXT: v_mov_b32_e32 v30, v31 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1378,54 +1950,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v31, s1 ; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v2, s1 +; GFX90A-NEXT: v_mov_b32_e32 v3, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, s1 +; GFX90A-NEXT: v_mov_b32_e32 v5, s1 +; GFX90A-NEXT: v_mov_b32_e32 v6, s1 +; GFX90A-NEXT: v_mov_b32_e32 v7, s1 +; GFX90A-NEXT: v_mov_b32_e32 v8, s1 +; GFX90A-NEXT: v_mov_b32_e32 v9, s1 +; GFX90A-NEXT: v_mov_b32_e32 v10, s1 +; GFX90A-NEXT: v_mov_b32_e32 v11, s1 +; GFX90A-NEXT: v_mov_b32_e32 v12, s1 +; GFX90A-NEXT: v_mov_b32_e32 v13, s1 +; GFX90A-NEXT: v_mov_b32_e32 v14, s1 +; GFX90A-NEXT: v_mov_b32_e32 v15, s1 +; GFX90A-NEXT: v_mov_b32_e32 v16, s1 +; GFX90A-NEXT: v_mov_b32_e32 v17, s1 +; GFX90A-NEXT: v_mov_b32_e32 v18, s1 +; GFX90A-NEXT: v_mov_b32_e32 v19, s1 +; GFX90A-NEXT: v_mov_b32_e32 v20, s1 +; GFX90A-NEXT: v_mov_b32_e32 v21, s1 +; GFX90A-NEXT: v_mov_b32_e32 v22, s1 +; GFX90A-NEXT: v_mov_b32_e32 v23, s1 +; GFX90A-NEXT: v_mov_b32_e32 v24, s1 +; GFX90A-NEXT: v_mov_b32_e32 v25, s1 +; GFX90A-NEXT: v_mov_b32_e32 v26, s1 +; GFX90A-NEXT: v_mov_b32_e32 v27, s1 +; GFX90A-NEXT: v_mov_b32_e32 v28, s1 +; GFX90A-NEXT: v_mov_b32_e32 v29, s1 +; GFX90A-NEXT: v_mov_b32_e32 v30, s1 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1440,54 +2075,117 @@ define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v31, s1 ; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v2, s1 +; GFX942-NEXT: v_mov_b32_e32 v3, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, s1 +; GFX942-NEXT: v_mov_b32_e32 v5, s1 +; GFX942-NEXT: v_mov_b32_e32 v6, s1 +; GFX942-NEXT: v_mov_b32_e32 v7, s1 +; GFX942-NEXT: v_mov_b32_e32 v8, s1 +; GFX942-NEXT: v_mov_b32_e32 v9, s1 +; GFX942-NEXT: v_mov_b32_e32 v10, s1 +; GFX942-NEXT: v_mov_b32_e32 v11, s1 +; GFX942-NEXT: v_mov_b32_e32 v12, s1 +; GFX942-NEXT: v_mov_b32_e32 v13, s1 +; GFX942-NEXT: v_mov_b32_e32 v14, s1 +; GFX942-NEXT: v_mov_b32_e32 v15, s1 +; GFX942-NEXT: v_mov_b32_e32 v16, s1 +; GFX942-NEXT: v_mov_b32_e32 v17, s1 +; GFX942-NEXT: v_mov_b32_e32 v18, s1 +; GFX942-NEXT: v_mov_b32_e32 v19, s1 +; GFX942-NEXT: v_mov_b32_e32 v20, s1 +; GFX942-NEXT: v_mov_b32_e32 v21, s1 +; GFX942-NEXT: v_mov_b32_e32 v22, s1 +; GFX942-NEXT: v_mov_b32_e32 v23, s1 +; GFX942-NEXT: v_mov_b32_e32 v24, s1 +; GFX942-NEXT: v_mov_b32_e32 v25, s1 +; GFX942-NEXT: v_mov_b32_e32 v26, s1 +; GFX942-NEXT: v_mov_b32_e32 v27, s1 +; GFX942-NEXT: v_mov_b32_e32 v28, s1 +; GFX942-NEXT: v_mov_b32_e32 v29, s1 +; GFX942-NEXT: v_mov_b32_e32 v30, s1 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1644,56 +2342,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX90A-LABEL: test_mfma_loop_mixed_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX90A-NEXT: s_mov_b32 s0, 16 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s1 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: s_mov_b32 s0, 16 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v1, s1 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -1707,56 +2467,118 @@ define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, floa ; GFX942-LABEL: test_mfma_loop_mixed_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c +; GFX942-NEXT: s_mov_b32 s0, 16 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, s1 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v1, s1 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2094,49 +2916,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: s_nop 2 -; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v33 ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v32, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v33, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_nop 12 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2156,49 +3043,114 @@ define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 { ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: s_nop 15 ; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0 -; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v33, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a0 ; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v33 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v32, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v33, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_nop 11 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 @@ -2401,7 +3353,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX90A-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2471,7 +3422,6 @@ define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 -; GFX942-NEXT: ; kill: def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 killed $exec ; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Loop Header: Depth=1 ; GFX942-NEXT: ; Child Loop BB9_2 Depth 2 @@ -2615,50 +3565,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2696,50 +3710,114 @@ define <32 x float> @test_mfma_loop_zeroinit_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_zeroinit_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB10_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2873,50 +3951,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX90A-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s4, 16 -; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 +; GFX90A-NEXT: v_mov_b32_e32 v7, 0 +; GFX90A-NEXT: v_mov_b32_e32 v8, 0 +; GFX90A-NEXT: v_mov_b32_e32 v9, 0 +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 +; GFX90A-NEXT: v_mov_b32_e32 v11, 0 +; GFX90A-NEXT: v_mov_b32_e32 v12, 0 +; GFX90A-NEXT: v_mov_b32_e32 v13, 0 +; GFX90A-NEXT: v_mov_b32_e32 v14, 0 +; GFX90A-NEXT: v_mov_b32_e32 v15, 0 +; GFX90A-NEXT: v_mov_b32_e32 v16, 0 +; GFX90A-NEXT: v_mov_b32_e32 v17, 0 +; GFX90A-NEXT: v_mov_b32_e32 v18, 0 +; GFX90A-NEXT: v_mov_b32_e32 v19, 0 +; GFX90A-NEXT: v_mov_b32_e32 v20, 0 +; GFX90A-NEXT: v_mov_b32_e32 v21, 0 +; GFX90A-NEXT: v_mov_b32_e32 v22, 0 +; GFX90A-NEXT: v_mov_b32_e32 v23, 0 +; GFX90A-NEXT: v_mov_b32_e32 v24, 0 +; GFX90A-NEXT: v_mov_b32_e32 v25, 0 +; GFX90A-NEXT: v_mov_b32_e32 v26, 0 +; GFX90A-NEXT: v_mov_b32_e32 v27, 0 +; GFX90A-NEXT: v_mov_b32_e32 v28, 0 +; GFX90A-NEXT: v_mov_b32_e32 v29, 0 +; GFX90A-NEXT: v_mov_b32_e32 v30, 0 +; GFX90A-NEXT: v_mov_b32_e32 v31, 0 +; GFX90A-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX90A-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_nop 1 -; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX90A-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: s_add_i32 s4, s4, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s4, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v32, v0, a[0:31] +; GFX90A-NEXT: s_nop 15 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX90A-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX90A-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX90A-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX90A-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX90A-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX90A-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX90A-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX90A-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX90A-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX90A-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX90A-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX90A-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX90A-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX90A-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX90A-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX90A-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX90A-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX90A-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX90A-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX90A-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX90A-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX90A-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX90A-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX90A-NEXT: ; %bb.2: ; %exit -; GFX90A-NEXT: s_nop 15 ; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 @@ -2954,50 +4096,114 @@ define <32 x float> @test_mfma_loop_non_splat_ret_use() #0 { ; GFX942-LABEL: test_mfma_loop_non_splat_ret_use: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 -; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 -; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 -; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: v_mov_b32_e32 v5, 0 +; GFX942-NEXT: v_mov_b32_e32 v6, 0 +; GFX942-NEXT: v_mov_b32_e32 v7, 0 +; GFX942-NEXT: v_mov_b32_e32 v8, 0 +; GFX942-NEXT: v_mov_b32_e32 v9, 0 +; GFX942-NEXT: v_mov_b32_e32 v10, 0 +; GFX942-NEXT: v_mov_b32_e32 v11, 0 +; GFX942-NEXT: v_mov_b32_e32 v12, 0 +; GFX942-NEXT: v_mov_b32_e32 v13, 0 +; GFX942-NEXT: v_mov_b32_e32 v14, 0 +; GFX942-NEXT: v_mov_b32_e32 v15, 0 +; GFX942-NEXT: v_mov_b32_e32 v16, 0 +; GFX942-NEXT: v_mov_b32_e32 v17, 0 +; GFX942-NEXT: v_mov_b32_e32 v18, 0 +; GFX942-NEXT: v_mov_b32_e32 v19, 0 +; GFX942-NEXT: v_mov_b32_e32 v20, 0 +; GFX942-NEXT: v_mov_b32_e32 v21, 0 +; GFX942-NEXT: v_mov_b32_e32 v22, 0 +; GFX942-NEXT: v_mov_b32_e32 v23, 0 +; GFX942-NEXT: v_mov_b32_e32 v24, 0 +; GFX942-NEXT: v_mov_b32_e32 v25, 0 +; GFX942-NEXT: v_mov_b32_e32 v26, 0 +; GFX942-NEXT: v_mov_b32_e32 v27, 0 +; GFX942-NEXT: v_mov_b32_e32 v28, 0 +; GFX942-NEXT: v_mov_b32_e32 v29, 0 +; GFX942-NEXT: v_mov_b32_e32 v30, 0 +; GFX942-NEXT: v_mov_b32_e32 v31, 0 +; GFX942-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-NEXT: .LBB11_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX942-NEXT: s_nop 1 -; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] +; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX942-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX942-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX942-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX942-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX942-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX942-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX942-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX942-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX942-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX942-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX942-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX942-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX942-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX942-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX942-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX942-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX942-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX942-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX942-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX942-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX942-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX942-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX942-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX942-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX942-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX942-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX942-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX942-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX942-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX942-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 +; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v32, v0, a[0:31] +; GFX942-NEXT: s_nop 15 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX942-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX942-NEXT: v_accvgpr_read_b32 v4, a4 +; GFX942-NEXT: v_accvgpr_read_b32 v5, a5 +; GFX942-NEXT: v_accvgpr_read_b32 v6, a6 +; GFX942-NEXT: v_accvgpr_read_b32 v7, a7 +; GFX942-NEXT: v_accvgpr_read_b32 v8, a8 +; GFX942-NEXT: v_accvgpr_read_b32 v9, a9 +; GFX942-NEXT: v_accvgpr_read_b32 v10, a10 +; GFX942-NEXT: v_accvgpr_read_b32 v11, a11 +; GFX942-NEXT: v_accvgpr_read_b32 v12, a12 +; GFX942-NEXT: v_accvgpr_read_b32 v13, a13 +; GFX942-NEXT: v_accvgpr_read_b32 v14, a14 +; GFX942-NEXT: v_accvgpr_read_b32 v15, a15 +; GFX942-NEXT: v_accvgpr_read_b32 v16, a16 +; GFX942-NEXT: v_accvgpr_read_b32 v17, a17 +; GFX942-NEXT: v_accvgpr_read_b32 v18, a18 +; GFX942-NEXT: v_accvgpr_read_b32 v19, a19 +; GFX942-NEXT: v_accvgpr_read_b32 v20, a20 +; GFX942-NEXT: v_accvgpr_read_b32 v21, a21 +; GFX942-NEXT: v_accvgpr_read_b32 v22, a22 +; GFX942-NEXT: v_accvgpr_read_b32 v23, a23 +; GFX942-NEXT: v_accvgpr_read_b32 v24, a24 +; GFX942-NEXT: v_accvgpr_read_b32 v25, a25 +; GFX942-NEXT: v_accvgpr_read_b32 v26, a26 +; GFX942-NEXT: v_accvgpr_read_b32 v27, a27 +; GFX942-NEXT: v_accvgpr_read_b32 v28, a28 +; GFX942-NEXT: v_accvgpr_read_b32 v29, a29 +; GFX942-NEXT: v_accvgpr_read_b32 v30, a30 +; GFX942-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX942-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX942-NEXT: ; %bb.2: ; %exit -; GFX942-NEXT: s_nop 14 ; GFX942-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX942-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX942-NEXT: v_accvgpr_read_b32 v2, a2 |
