; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s ; Check that we do not copy agprs to vgprs and back inside the loop. ; Final result should be read only once after the loop. define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_zeroinit: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; 
GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX908-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; 
GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_zeroinit: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; 
GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_zeroinit: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX942-NEXT: 
v_accvgpr_write_b32 a13, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a6, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader for.cond.preheader: %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 
x float> %phi, i32 0, i32 0, i32 0) %inc = add nuw nsw i32 %c, 1 %cc = icmp eq i32 %inc, 16 br i1 %cc, label %exit, label %for.cond.preheader exit: store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. ; 3 vgprs are needed to avoid wait states between writes. ; Check that we do not use 32 temp sgprs as well. define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_unfoldable_splat: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: 
v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: 
global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_splat: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; 
GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_unfoldable_splat: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX942-NEXT: v_accvgpr_write_b32 
a14, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB1_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader for.cond.preheader: %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] %mai.1 = tail call <32 x float> 
@llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) %inc = add nuw nsw i32 %c, 1 %cc = icmp eq i32 %inc, 16 br i1 %cc, label %exit, label %for.cond.preheader exit: store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_non_splat: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a13, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a6, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: 
v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: 
global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_non_splat: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; 
GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_non_splat: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a26, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a25, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a24, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a23, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a22, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a21, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a20, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a19, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a18, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a16, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a14, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a13, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a12, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a11, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a10, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a9, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a8, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a7, 0 ; GFX942-NEXT: 
v_accvgpr_write_b32 a6, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a5, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a4, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: br label %for.cond.preheader for.cond.preheader: %phi = phi <32 x float> [ , %entry ], [ %mai.1, %for.cond.preheader ] %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) %inc = add nuw nsw i32 %c, 1 %cc = icmp eq i32 %inc, 16 br i1 %cc, label %exit, label %for.cond.preheader exit: store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. ; 3 vgprs are needed to avoid wait states between writes. 
; FIXME: Constant is now in VGPR instead of SGPR. define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 { ; GFX908-LABEL: test_mfma_loop_unfoldable_seq: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX908-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43190000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43180000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43170000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43160000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43150000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43140000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43130000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43120000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43110000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43100000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430f0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430e0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430d0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430c0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430b0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: 
v_accvgpr_write_b32 a16, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x430a0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43090000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43080000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43070000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43060000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43050000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43040000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43030000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43020000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43010000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x43000000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fe0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fc0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fa0000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f80000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; 
GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] 
offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_unfoldable_seq: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000 ; 
GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] 
offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_unfoldable_seq: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000 ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000 ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000 ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000 ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000 ; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000 ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000 ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000 ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000 ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000 ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000 ; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000 ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000 ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000 ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000 ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000 ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000 ; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000 ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000 ; 
GFX942-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000 ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000 ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000 ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000 ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000 ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000 ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000 ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000 ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000 ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], 
s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; The accumulator is seeded with 32 distinct constants, 123.0 in lane 0 up to
  ; 154.0 in lane 31. These match the assertions for this function, which
  ; move 0x42f60000 into a0 and step up to 0x431a0000 in a31, each through a
  ; v_mov_b32 into v0 followed by a v_accvgpr_write_b32.
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once %inc reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  ; Only the final MFMA result is stored.
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; The accumulator is seeded from a VGPR: the workitem id is bitcast to float
; and inserted into all 32 lanes before the loop.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_loop_vgpr_init:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
; GFX908-NEXT:
v_accvgpr_write_b32 a5, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: s_mov_b32 s0, 16 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX908-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: 
v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_vgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a5, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_vgpr_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX942-NEXT: 
v_accvgpr_write_b32 a20, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB4_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] 
; GFX942-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %init = bitcast i32 %tid to float %tmp0 = insertelement <32 x float> poison, float %init, i32 0 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 
%tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
  br label %for.cond.preheader

for.cond.preheader:
  ; %phi carries the accumulator: it feeds the MFMA and takes its result on
  ; the back edge. The loop exits once %inc reaches 16.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  ; Only the final MFMA result is stored.
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; The accumulator is seeded from the scalar kernel argument %init, which is
; inserted into all 32 lanes before the loop.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
; GFX908-LABEL: test_mfma_loop_sgpr_init:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dword s0, s[4:5], 0x2c
; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v0, s0
; GFX908-NEXT: s_mov_b32 s0, 16
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
; GFX908-NEXT:
v_accvgpr_write_b32 a8, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX908-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX908-NEXT: s_nop 1 ; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX908-NEXT: s_add_i32 s0, s0, -1 ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX908-NEXT: ; %bb.2: ; %exit ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX908-NEXT: s_nop 7 ; GFX908-NEXT: s_nop 5 ; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX908-NEXT: v_accvgpr_read_b32 v28, a28 ; GFX908-NEXT: v_accvgpr_read_b32 v29, a29 ; GFX908-NEXT: v_accvgpr_read_b32 v30, a30 ; GFX908-NEXT: v_accvgpr_read_b32 v31, a31 ; GFX908-NEXT: v_mov_b32_e32 v32, 0 ; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 ; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 ; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 ; GFX908-NEXT: v_accvgpr_read_b32 v4, a4 ; GFX908-NEXT: v_accvgpr_read_b32 v5, a5 ; GFX908-NEXT: v_accvgpr_read_b32 v6, a6 ; GFX908-NEXT: v_accvgpr_read_b32 v7, a7 ; GFX908-NEXT: v_accvgpr_read_b32 v8, a8 ; GFX908-NEXT: v_accvgpr_read_b32 v9, a9 ; GFX908-NEXT: v_accvgpr_read_b32 v10, a10 ; GFX908-NEXT: v_accvgpr_read_b32 v11, a11 ; GFX908-NEXT: v_accvgpr_read_b32 v12, a12 ; GFX908-NEXT: v_accvgpr_read_b32 v13, a13 ; GFX908-NEXT: v_accvgpr_read_b32 v14, a14 ; GFX908-NEXT: v_accvgpr_read_b32 v15, a15 ; GFX908-NEXT: v_accvgpr_read_b32 v16, a16 ; GFX908-NEXT: v_accvgpr_read_b32 v17, a17 ; GFX908-NEXT: v_accvgpr_read_b32 v18, a18 ; GFX908-NEXT: v_accvgpr_read_b32 v19, a19 ; GFX908-NEXT: v_accvgpr_read_b32 v20, a20 ; GFX908-NEXT: v_accvgpr_read_b32 v21, a21 ; GFX908-NEXT: v_accvgpr_read_b32 v22, a22 ; 
GFX908-NEXT: v_accvgpr_read_b32 v23, a23 ; GFX908-NEXT: v_accvgpr_read_b32 v24, a24 ; GFX908-NEXT: v_accvgpr_read_b32 v25, a25 ; GFX908-NEXT: v_accvgpr_read_b32 v26, a26 ; GFX908-NEXT: v_accvgpr_read_b32 v27, a27 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_loop_sgpr_init: ; GFX90A: ; %bb.0: ; %entry ; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX90A-NEXT: s_mov_b32 s0, 16 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 
a10, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31] ; GFX90A-NEXT: s_add_i32 s0, s0, -1 ; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 ; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX90A-NEXT: ; %bb.2: ; %exit ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_loop_sgpr_init: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX942-NEXT: s_mov_b32 s0, 16 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a31, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a30, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a29, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a28, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a27, v0 ; GFX942-NEXT: 
v_accvgpr_write_b32 a26, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a25, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a24, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a23, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a22, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a21, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a20, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a19, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a18, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a16, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a14, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a13, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a12, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a11, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a10, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a9, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a8, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a7, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a6, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a5, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a4, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a3, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, v0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, v0 ; GFX942-NEXT: v_mov_b32_e32 v0, 2.0 ; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31] ; GFX942-NEXT: s_add_i32 s0, s0, -1 ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 ; GFX942-NEXT: ; %bb.2: ; %exit ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] 
offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm entry: %tmp0 = insertelement <32 x float> poison, float %init, i32 0 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 %tmp28 = 
insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Check a mixed accumulator initializer: element 0 comes from a VGPR (the
; workitem id), element 1 from the kernel argument %x (loaded into an SGPR),
; and elements 2..31 are zero. Every element should be written straight into
; its AGPR before the loop, and the loop itself should contain no
; v_accvgpr_read/v_accvgpr_write copies. On gfx908 the result is read back to
; VGPRs once after the loop; gfx90a and gfx942 store directly from the AGPRs.
define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
; GFX908-LABEL: test_mfma_loop_mixed_init:
; GFX908:       ; %bb.0: ; %entry
; GFX908-NEXT:    s_load_dword s1, s[4:5], 0x2c
; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
; GFX908-NEXT:    v_accvgpr_write_b32 a31, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a30, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a29, 0
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    v_mov_b32_e32 v0, s1
; GFX908-NEXT:    v_accvgpr_write_b32 a28, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a27, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a26, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a25, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a24, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a23, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a22, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a21, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a20, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a19, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a18, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a17, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a16, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a15, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a14, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a13, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a12, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a11, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a10, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a9, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a8, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a7, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a6, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a5, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a4, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a3, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a2, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a1, v0
; GFX908-NEXT:    s_mov_b32 s0, 16
; GFX908-NEXT:    v_mov_b32_e32 v0, 2.0
; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX908-NEXT:  .LBB6_1: ; %for.cond.preheader
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX908-NEXT:    s_add_i32 s0, s0, -1
; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
; GFX908-NEXT:    s_cbranch_scc1 .LBB6_1
; GFX908-NEXT:  ; %bb.2: ; %exit
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 5
; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
; GFX908-NEXT:    v_accvgpr_read_b32 v30, a30
; GFX908-NEXT:    v_accvgpr_read_b32 v31, a31
; GFX908-NEXT:    v_mov_b32_e32 v32, 0
; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
; GFX908-NEXT:    v_accvgpr_read_b32 v3, a3
; GFX908-NEXT:    v_accvgpr_read_b32 v4, a4
; GFX908-NEXT:    v_accvgpr_read_b32 v5, a5
; GFX908-NEXT:    v_accvgpr_read_b32 v6, a6
; GFX908-NEXT:    v_accvgpr_read_b32 v7, a7
; GFX908-NEXT:    v_accvgpr_read_b32 v8, a8
; GFX908-NEXT:    v_accvgpr_read_b32 v9, a9
; GFX908-NEXT:    v_accvgpr_read_b32 v10, a10
; GFX908-NEXT:    v_accvgpr_read_b32 v11, a11
; GFX908-NEXT:    v_accvgpr_read_b32 v12, a12
; GFX908-NEXT:    v_accvgpr_read_b32 v13, a13
; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
; GFX908-NEXT:    v_accvgpr_read_b32 v16, a16
; GFX908-NEXT:    v_accvgpr_read_b32 v17, a17
; GFX908-NEXT:    v_accvgpr_read_b32 v18, a18
; GFX908-NEXT:    v_accvgpr_read_b32 v19, a19
; GFX908-NEXT:    v_accvgpr_read_b32 v20, a20
; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mixed_init:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    s_load_dword s1, s[4:5], 0x2c
; GFX90A-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
; GFX90A-NEXT:    s_mov_b32 s0, 16
; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT:  .LBB6_1: ; %for.cond.preheader
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_nop 1
; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT:    s_add_i32 s0, s0, -1
; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
; GFX90A-NEXT:    s_cbranch_scc1 .LBB6_1
; GFX90A-NEXT:  ; %bb.2: ; %exit
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 4
; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT:    s_endpgm
;
; GFX942-LABEL: test_mfma_loop_mixed_init:
; GFX942:       ; %bb.0: ; %entry
; GFX942-NEXT:    s_load_dword s1, s[4:5], 0x2c
; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT:    v_accvgpr_write_b32 a0, v0
; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
; GFX942-NEXT:    v_mov_b32_e32 v0, s1
; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
; GFX942-NEXT:    s_mov_b32 s0, 16
; GFX942-NEXT:    v_accvgpr_write_b32 a1, v0
; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
; GFX942-NEXT:  .LBB6_1: ; %for.cond.preheader
; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT:    s_nop 1
; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT:    s_add_i32 s0, s0, -1
; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
; GFX942-NEXT:  ; %bb.2: ; %exit
; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT:    v_mov_b32_e32 v0, 0
; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
; GFX942-NEXT:    s_nop 7
; GFX942-NEXT:    s_nop 3
; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT:    s_endpgm
entry:
  ; Element 0 is the workitem id reinterpreted as a float; element 1 is the
  ; kernel argument %x; the zeroinitializer base supplies the other 30 lanes.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; the loop runs until %inc reaches 16.
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Check that the result of an MFMA issued before the loop is used directly as
; the loop-carried accumulator: the entry MFMA and the loop MFMA both operate
; in place on a[0:31], with no AGPR/VGPR copies between them. gfx908 zeroes
; the accumulator with v_accvgpr_write_b32 first, while gfx90a and gfx942 pass
; an inline 0 as the accumulator operand of the entry MFMA.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_loop_mfma_forward_init:
; GFX908:       ; %bb.0: ; %entry
; GFX908-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX908-NEXT:    v_accvgpr_write_b32 a0, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a1, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a2, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a3, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a4, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a5, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a6, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a7, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a8, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a9, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a10, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a11, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a12, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a13, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a14, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a15, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a16, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a17, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a18, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a19, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a20, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a21, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a22, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a23, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a24, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a25, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a26, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a27, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a28, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a29, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a30, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a31, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX908-NEXT:    s_mov_b32 s0, 16
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT:  .LBB7_1: ; %for.cond.preheader
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT:    s_add_i32 s0, s0, -1
; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
; GFX908-NEXT:    s_cbranch_scc1 .LBB7_1
; GFX908-NEXT:  ; %bb.2: ; %exit
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 5
; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
; GFX908-NEXT:    v_accvgpr_read_b32 v30, a30
; GFX908-NEXT:    v_accvgpr_read_b32 v31, a31
; GFX908-NEXT:    v_mov_b32_e32 v32, 0
; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
; GFX908-NEXT:    v_accvgpr_read_b32 v3, a3
; GFX908-NEXT:    v_accvgpr_read_b32 v4, a4
; GFX908-NEXT:    v_accvgpr_read_b32 v5, a5
; GFX908-NEXT:    v_accvgpr_read_b32 v6, a6
; GFX908-NEXT:    v_accvgpr_read_b32 v7, a7
; GFX908-NEXT:    v_accvgpr_read_b32 v8, a8
; GFX908-NEXT:    v_accvgpr_read_b32 v9, a9
; GFX908-NEXT:    v_accvgpr_read_b32 v10, a10
; GFX908-NEXT:    v_accvgpr_read_b32 v11, a11
; GFX908-NEXT:    v_accvgpr_read_b32 v12, a12
; GFX908-NEXT:    v_accvgpr_read_b32 v13, a13
; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
; GFX908-NEXT:    v_accvgpr_read_b32 v16, a16
; GFX908-NEXT:    v_accvgpr_read_b32 v17, a17
; GFX908-NEXT:    v_accvgpr_read_b32 v18, a18
; GFX908-NEXT:    v_accvgpr_read_b32 v19, a19
; GFX908-NEXT:    v_accvgpr_read_b32 v20, a20
; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_mfma_forward_init:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT:    s_mov_b32 s0, 16
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; GFX90A-NEXT:  .LBB7_1: ; %for.cond.preheader
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT:    s_add_i32 s0, s0, -1
; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
; GFX90A-NEXT:    s_cbranch_scc1 .LBB7_1
; GFX90A-NEXT:  ; %bb.2: ; %exit
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 4
; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT:    s_endpgm
;
; GFX942-LABEL: test_mfma_loop_mfma_forward_init:
; GFX942:       ; %bb.0: ; %entry
; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX942-NEXT:    s_mov_b32 s0, 16
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
; GFX942-NEXT:  .LBB7_1: ; %for.cond.preheader
; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT:    s_add_i32 s0, s0, -1
; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
; GFX942-NEXT:    s_cbranch_scc1 .LBB7_1
; GFX942-NEXT:  ; %bb.2: ; %exit
; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT:    v_mov_b32_e32 v0, 0
; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
; GFX942-NEXT:    s_nop 7
; GFX942-NEXT:    s_nop 3
; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT:    s_endpgm
entry:
  ; This MFMA result is the initial value of the loop accumulator below.
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; the loop runs until %inc reaches 16.
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Check that we are using only one tmp VGPR.
; Element 0 of an MFMA result is splatted into all 32 accumulator lanes before
; the loop. On gfx908 the value is read once into a single VGPR (v2) and then
; written to every AGPR; gfx90a and gfx942 need no VGPR at all for the splat
; and copy AGPR to AGPR with v_accvgpr_mov_b32.
define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_loop_agpr_init:
; GFX908:       ; %bb.0: ; %entry
; GFX908-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX908-NEXT:    v_accvgpr_write_b32 a0, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a1, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a2, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a3, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a4, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a5, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a6, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a7, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a8, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a9, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a10, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a11, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a12, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a13, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a14, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a15, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a16, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a17, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a18, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a19, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a20, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a21, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a22, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a23, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a24, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a25, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a26, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a27, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a28, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a29, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a30, 0
; GFX908-NEXT:    v_accvgpr_write_b32 a31, 0
; GFX908-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX908-NEXT:    s_mov_b32 s0, 16
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_read_b32 v2, a0
; GFX908-NEXT:    s_nop 1
; GFX908-NEXT:    v_accvgpr_write_b32 a0, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a1, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a2, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a3, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a4, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a5, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a6, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a7, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a8, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a9, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a10, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a11, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a12, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a13, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a14, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a15, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a16, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a17, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a18, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a19, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a20, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a21, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a22, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a23, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a24, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a25, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a26, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a27, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a28, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a29, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a30, v2
; GFX908-NEXT:    v_accvgpr_write_b32 a31, v2
; GFX908-NEXT:  .LBB8_1: ; %for.cond.preheader
; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT:    s_nop 0
; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX908-NEXT:    s_add_i32 s0, s0, -1
; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
; GFX908-NEXT:    s_cbranch_scc1 .LBB8_1
; GFX908-NEXT:  ; %bb.2: ; %exit
; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT:    s_nop 7
; GFX908-NEXT:    s_nop 5
; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
; GFX908-NEXT:    v_accvgpr_read_b32 v30, a30
; GFX908-NEXT:    v_accvgpr_read_b32 v31, a31
; GFX908-NEXT:    v_mov_b32_e32 v32, 0
; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
; GFX908-NEXT:    v_accvgpr_read_b32 v3, a3
; GFX908-NEXT:    v_accvgpr_read_b32 v4, a4
; GFX908-NEXT:    v_accvgpr_read_b32 v5, a5
; GFX908-NEXT:    v_accvgpr_read_b32 v6, a6
; GFX908-NEXT:    v_accvgpr_read_b32 v7, a7
; GFX908-NEXT:    v_accvgpr_read_b32 v8, a8
; GFX908-NEXT:    v_accvgpr_read_b32 v9, a9
; GFX908-NEXT:    v_accvgpr_read_b32 v10, a10
; GFX908-NEXT:    v_accvgpr_read_b32 v11, a11
; GFX908-NEXT:    v_accvgpr_read_b32 v12, a12
; GFX908-NEXT:    v_accvgpr_read_b32 v13, a13
; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
; GFX908-NEXT:    v_accvgpr_read_b32 v16, a16
; GFX908-NEXT:    v_accvgpr_read_b32 v17, a17
; GFX908-NEXT:    v_accvgpr_read_b32 v18, a18
; GFX908-NEXT:    v_accvgpr_read_b32 v19, a19
; GFX908-NEXT:    v_accvgpr_read_b32 v20, a20
; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT:    s_endpgm
;
; GFX90A-LABEL: test_mfma_loop_agpr_init:
; GFX90A:       ; %bb.0: ; %entry
; GFX90A-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX90A-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX90A-NEXT:    s_mov_b32 s0, 16
; GFX90A-NEXT:    s_nop 0
; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 2
; GFX90A-NEXT:    v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a5, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a6, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a7, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a8, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a9, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a10, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a11, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a12, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a13, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a14, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a15, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a17, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a18, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a19, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a20, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a21, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a22, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a23, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a24, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a25, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a26, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a27, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a28, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a29, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT:    v_accvgpr_mov_b32 a31, a0
; GFX90A-NEXT:  .LBB8_1: ; %for.cond.preheader
; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    s_nop 1
; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GFX90A-NEXT:    s_add_i32 s0, s0, -1
; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
; GFX90A-NEXT:    s_cbranch_scc1 .LBB8_1
; GFX90A-NEXT:  ; %bb.2: ; %exit
; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_nop 7
; GFX90A-NEXT:    s_nop 4
; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT:    s_endpgm
;
; GFX942-LABEL: test_mfma_loop_agpr_init:
; GFX942:       ; %bb.0: ; %entry
; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX942-NEXT:    s_mov_b32 s0, 16
; GFX942-NEXT:    s_nop 0
; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
; GFX942-NEXT:    s_nop 7
; GFX942-NEXT:    s_nop 7
; GFX942-NEXT:    s_nop 1
; GFX942-NEXT:    v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a5, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a6, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a7, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a8, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a9, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a10, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a11, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a12, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a13, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a14, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a15, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a16, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a17, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a18, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a19, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a20, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a21, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a22, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a23, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a24, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a25, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a26, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a27, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a28, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a29, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT:    v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT:  .LBB8_1: ; %for.cond.preheader
; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX942-NEXT:    s_nop 1
; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GFX942-NEXT:    s_add_i32 s0, s0, -1
; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
; GFX942-NEXT:  ; %bb.2: ; %exit
; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT:    v_mov_b32_e32 v0, 0
; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
; GFX942-NEXT:    s_nop 7
; GFX942-NEXT:    s_nop 3
; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT:    s_endpgm
entry:
  ; Take lane 0 of a fresh MFMA result and broadcast it into every element of
  ; the initial accumulator vector.
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %init = extractelement <32 x float> %mai.0, i32 0
  %tmp0 = insertelement <32 x float> poison, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; the loop runs until %inc reaches 16.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Check that we do not copy agprs to vgprs and back in an outer loop.
; Final result should be read only once after the loop.
;
; Shape of the test: an outer loop (%for.cond.preheader / %inner.exit) wraps an
; inner loop (%inner.for.cond.preheader). Each loop leaves when its counter,
; starting from 0, reaches 16. The <32 x float> accumulator starts as
; zeroinitializer and is carried through both loops by phi nodes, so the
; assertions below expect the MFMA to update a[0:31] in place and the AGPRs to
; be read back (gfx908) or stored directly (gfx90a, gfx942) only in %exit.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
; GFX908-LABEL: test_mfma_nested_loop_zeroinit:
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
; GFX908-NEXT: s_mov_b32 s0, 0
; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
; GFX908-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB9_2 Depth 2
; GFX908-NEXT: s_mov_b32 s1, 16
; GFX908-NEXT: .LBB9_2: ; %inner.for.cond.preheader
; GFX908-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: s_nop 0
; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX908-NEXT: s_add_i32 s1, s1, -1
; GFX908-NEXT: s_cmp_lg_u32 s1, 0
; GFX908-NEXT: s_cbranch_scc1 .LBB9_2
; GFX908-NEXT: ; %bb.3: ; %inner.exit
; GFX908-NEXT: ; in Loop: Header=BB9_1 Depth=1
; GFX908-NEXT: s_add_i32 s0, s0, 1
; GFX908-NEXT: s_cmp_lg_u32 s0, 16
; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
; GFX908-NEXT: ; %bb.4: ; %exit
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX908-NEXT: s_nop 7
; GFX908-NEXT: s_nop 2
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
; GFX908-NEXT: v_mov_b32_e32 v32, 0
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_nested_loop_zeroinit:
; GFX90A: ; %bb.0: ; %entry
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
; GFX90A-NEXT: s_mov_b32 s0, 0
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
; GFX90A-NEXT: s_mov_b32 s1, 16
; GFX90A-NEXT: .LBB9_2: ; %inner.for.cond.preheader
; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
; GFX90A-NEXT: s_add_i32 s1, s1, -1
; GFX90A-NEXT: s_cmp_lg_u32 s1, 0
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2
; GFX90A-NEXT: ; %bb.3: ; %inner.exit
; GFX90A-NEXT: ; in Loop: Header=BB9_1 Depth=1
; GFX90A-NEXT: s_add_i32 s0, s0, 1
; GFX90A-NEXT: s_cmp_lg_u32 s0, 16
; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1
; GFX90A-NEXT: ; %bb.4: ; %exit
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_nested_loop_zeroinit:
; GFX942: ; %bb.0: ; %entry
; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
; GFX942-NEXT: s_mov_b32 s0, 0
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
; GFX942-NEXT: ; =>This Loop Header: Depth=1
; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
; GFX942-NEXT: s_mov_b32 s1, 16
; GFX942-NEXT: .LBB9_2: ; %inner.for.cond.preheader
; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1
; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
; GFX942-NEXT: s_add_i32 s1, s1, -1
; GFX942-NEXT: s_cmp_lg_u32 s1, 0
; GFX942-NEXT: s_cbranch_scc1 .LBB9_2
; GFX942-NEXT: ; %bb.3: ; %inner.exit
; GFX942-NEXT: ; in Loop: Header=BB9_1 Depth=1
; GFX942-NEXT: s_add_i32 s0, s0, 1
; GFX942-NEXT: s_cmp_lg_u32 s0, 16
; GFX942-NEXT: s_cbranch_scc1 .LBB9_1
; GFX942-NEXT: ; %bb.4: ; %exit
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
entry:
  br label %for.cond.preheader

; Outer loop header: %phi.0 is all zeros on the first pass and the most recent
; inner-loop result on every later pass; %c.0 is the outer counter.
for.cond.preheader:
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

; Inner loop (single block): one MFMA per iteration, with its result fed back
; into the vector operand through %phi. %c restarts at 0 each time the outer
; loop enters this block and the loop leaves once %inc equals 16.
inner.for.cond.preheader:
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

; Outer loop latch: advance the outer counter and leave once it equals 16.
inner.exit:
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

; Single store of the final accumulator value to the kernel's pointer argument.
exit:
  store <32 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Intrinsic declarations. The workitem-id intrinsic is not referenced by the
; function above; presumably it is used by other tests in this file.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; Attribute group #0, attached to the kernel definition above.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }