; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY908 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -early-live-intervals -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY942 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GREEDY90A-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -sgpr-regalloc=fast -vgpr-regalloc=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=FAST90A %s

; This is better with 90a

; Check that, for MFMA instructions that read more than 4 registers as SrcC,
; the Dst and SrcC register ranges are either completely disjoint or exactly
; the same, and never partially overlap.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; GREEDY908-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY908: ; %bb.0: ; %bb ; GREEDY908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GREEDY908-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY908-NEXT: v_mov_b32_e32 v0, s16 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s17 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s18 ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v1 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s22 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s23 ; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s24 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s25 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s26 ; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s27 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s28 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s29 ; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s30 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s31 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s1 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s2 ; GREEDY908-NEXT: v_mov_b32_e32 v2, 
s3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s4 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s5 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s7 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s8 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s9 ; GREEDY908-NEXT: v_mov_b32_e32 v3, s19 ; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s10 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s11 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s12 ; GREEDY908-NEXT: v_mov_b32_e32 v4, s20 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s21 ; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, s13 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s14 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s15 ; GREEDY908-NEXT: v_mov_b32_e32 v3, 1.0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v4 ; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v5 ; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v2 ; GREEDY908-NEXT: v_mov_b32_e32 v0, 2.0 ; GREEDY908-NEXT: v_mov_b32_e32 v4, 0 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32 ; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61 ; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60 ; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33 
; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59 ; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58 ; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34 ; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57 ; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56 ; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35 ; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55 ; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54 ; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36 ; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53 ; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52 ; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37 ; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51 ; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50 ; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38 ; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49 ; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48 ; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39 ; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46 ; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40 ; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19 ; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41 ; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18 ; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17 ; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42 ; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16 ; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15 ; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43 ; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14 ; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13 ; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1 ; GREEDY908-NEXT: 
v_accvgpr_read_b32 v1, a44 ; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12 ; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11 ; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45 ; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10 ; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9 ; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8 ; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7 ; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6 ; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a27 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a26 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a25 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a24 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a31 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a30 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a29 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a28 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a19 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a18 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a17 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a16 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a23 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a22 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a21 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a20 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, 
a10 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, s16 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, s17 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, s22 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, s23 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, s24 ; GREEDY90A-NEXT: v_accvgpr_write_b32 
a9, s25 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, s26 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, s27 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, s28 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, s29 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, s30 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, s31 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47 ; GREEDY90A-NEXT: 
v_accvgpr_mov_b32 a18, a48 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: v_accvgpr_write_b32 a0, s16 ; GREEDY942-NEXT: v_accvgpr_write_b32 a1, s17 ; GREEDY942-NEXT: v_accvgpr_write_b32 a2, s18 ; GREEDY942-NEXT: v_accvgpr_write_b32 a3, s19 ; 
GREEDY942-NEXT: v_accvgpr_write_b32 a4, s20 ; GREEDY942-NEXT: v_accvgpr_write_b32 a5, s21 ; GREEDY942-NEXT: v_accvgpr_write_b32 a6, s22 ; GREEDY942-NEXT: v_accvgpr_write_b32 a7, s23 ; GREEDY942-NEXT: v_accvgpr_write_b32 a8, s24 ; GREEDY942-NEXT: v_accvgpr_write_b32 a9, s25 ; GREEDY942-NEXT: v_accvgpr_write_b32 a10, s26 ; GREEDY942-NEXT: v_accvgpr_write_b32 a11, s27 ; GREEDY942-NEXT: v_accvgpr_write_b32 a12, s28 ; GREEDY942-NEXT: v_accvgpr_write_b32 a13, s29 ; GREEDY942-NEXT: v_accvgpr_write_b32 a14, s30 ; GREEDY942-NEXT: v_accvgpr_write_b32 a15, s31 ; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0 ; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1 ; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2 ; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3 ; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4 ; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5 ; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6 ; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7 ; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8 ; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9 ; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10 ; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11 ; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12 ; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13 ; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14 ; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[32:63], v0, v1, a[0:31] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41 ; 
GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[24:27], s[34:35] offset:96 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[28:31], s[34:35] offset:112 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[16:19], s[34:35] offset:64 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[20:23], s[34:35] offset:80 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[34:35] offset:32 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[34:35] offset:48 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[34:35] ; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[34:35] offset:16 ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: test_mfma_f32_32x32x1f32: ; GREEDY90A-GISEL: ; %bb.0: ; %bb ; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], 
s[34:35], 0x0 ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a16, s16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a17, s17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a18, s18 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a19, s19 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a20, s20 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a21, s21 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a22, s22 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a23, s23 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a24, s24 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a25, s25 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a26, s26 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a27, s27 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a28, s28 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a29, s29 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a30, s30 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a31, s31 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: 
s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a32 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a33 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a34 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a35 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a36 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a37 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a38 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a39 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a40 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a41 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a42 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a43 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a44 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a45 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a16, a46 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a17, a47 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a18, a48 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a19, a49 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a20, a50 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a21, a51 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a22, a52 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a23, a53 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a24, a54 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a25, a55 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a26, a56 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a27, a57 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a28, a58 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a29, a59 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a30, a60 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a31, a61 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GREEDY90A-GISEL-NEXT: s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; 
GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GREEDY90A-GISEL-NEXT: s_endpgm ; ; FAST90A-LABEL: test_mfma_f32_32x32x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 ; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 ; FAST90A-NEXT: v_mov_b32_e32 v0, 0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0 ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36 ; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37 ; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38 ; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39 ; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40 ; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41 ; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42 ; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43 ; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44 ; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45 ; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46 ; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47 ; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48 ; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49 ; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50 ; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51 ; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4 ; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5 ; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6 ; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7 ; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8 ; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9 ; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10 ; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11 ; 
FAST90A-NEXT: v_accvgpr_write_b32 a56, s12 ; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13 ; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14 ; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15 ; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16 ; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17 ; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18 ; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63] ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63] ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29 ; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28 ; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27 ; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26 ; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25 ; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24 ; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23 ; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22 ; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21 ; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20 ; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19 ; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18 ; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17 ; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16 ; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15 ; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14 ; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13 ; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12 ; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11 ; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10 ; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9 ; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8 ; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7 ; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6 ; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5 ; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4 ; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3 ; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2 ; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1 ; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0 ; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32 ; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33 ; FAST90A-NEXT: 
v_accvgpr_write_b32 a2, v32 ; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31 ; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30 ; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29 ; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28 ; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27 ; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26 ; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25 ; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24 ; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23 ; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22 ; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21 ; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20 ; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19 ; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18 ; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17 ; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16 ; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15 ; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14 ; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13 ; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12 ; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11 ; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10 ; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9 ; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8 ; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7 ; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6 ; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5 ; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4 ; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; FAST90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; FAST90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; FAST90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] 
offset:16
; FAST90A-NEXT: s_endpgm
bb:
  ; Initial accumulator: 32 floats loaded from the kernel's pointer argument.
  %in.1 = load <32 x float>, ptr addrspace(1) %arg
  ; Two chained MFMAs: the result of the first is SrcC of the second.
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0)
  ; SrcC of the third MFMA mixes both earlier results: lanes 0-1 are lanes 0-1
  ; of %mai.1 (mask indices 32 and 33 select from the second operand) and
  ; lanes 2-31 are lanes 0-29 of %mai.2.
  %tmp.1 = shufflevector <32 x float> %mai.2, <32 x float> %mai.1, <32 x i32> <i32 32, i32 33, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29>
  %mai.3 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %tmp.1, i32 0, i32 0, i32 0)
  ; Write the final result back over the input buffer.
  store <32 x float> %mai.3, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY908: ; %bb.0: ; %bb
; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0
; GREEDY908-NEXT: v_mov_b32_e32 v4, 0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT: v_mov_b32_e32 v5, s15
; GREEDY908-NEXT: v_mov_b32_e32 v2, s14
; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5
; GREEDY908-NEXT: v_mov_b32_e32 v5, s12
; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5
; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
; GREEDY908-NEXT: v_mov_b32_e32 v5, s9
; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5
; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
; GREEDY908-NEXT: v_mov_b32_e32 v5, s6
; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5
; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
; GREEDY908-NEXT: v_mov_b32_e32
v5, s3 ; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5 ; GREEDY908-NEXT: v_mov_b32_e32 v2, s2 ; GREEDY908-NEXT: v_mov_b32_e32 v1, s1 ; GREEDY908-NEXT: v_mov_b32_e32 v5, s0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1 ; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5 ; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2 ; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY908-NEXT: s_nop 7 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a15 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6 ; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16 ; GREEDY908-NEXT: s_nop 0 ; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3 ; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2 ; 
GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1 ; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0 ; GREEDY908-NEXT: s_nop 1 ; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GREEDY908-NEXT: s_endpgm ; ; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY90A: ; %bb.0: ; %bb ; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33] ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY90A-NEXT: s_nop 1 ; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-NEXT: s_nop 7 ; GREEDY90A-NEXT: s_nop 2 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 ; GREEDY90A-NEXT: 
global_store_dwordx4 v2, a[0:3], s[16:17] ; GREEDY90A-NEXT: s_endpgm ; ; GREEDY942-LABEL: test_mfma_f32_16x16x1f32: ; GREEDY942: ; %bb.0: ; %bb ; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY942-NEXT: v_mov_b32_e32 v2, 0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY942-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15 ; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14 ; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13 ; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12 ; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11 ; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10 ; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9 ; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8 ; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7 ; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6 ; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5 ; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4 ; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3 ; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2 ; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1 ; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33] ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 0 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18 ; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GREEDY942-NEXT: s_nop 7 ; GREEDY942-NEXT: s_nop 1 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16 ; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17] ; GREEDY942-NEXT: s_endpgm ; ; GREEDY90A-GISEL-LABEL: 
test_mfma_f32_16x16x1f32: ; GREEDY90A-GISEL: ; %bb.0: ; %bb ; GREEDY90A-GISEL-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v1, 2.0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GREEDY90A-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a0, s0 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a1, s1 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a2, s2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a3, s3 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a4, s4 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a5, s5 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a6, s6 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a7, s7 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a8, s8 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a9, s9 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a10, s10 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a11, s11 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a12, s12 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a13, s13 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a14, s14 ; GREEDY90A-GISEL-NEXT: v_accvgpr_write_b32 a15, s15 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 2 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a2, a16 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a3, a17 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a4, a18 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a5, a19 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a6, a20 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a7, a21 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a8, a22 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a9, a23 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a10, a24 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a11, a25 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a12, a26 ; 
GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a13, a27 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a14, a28 ; GREEDY90A-GISEL-NEXT: v_accvgpr_mov_b32 a15, a29 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GREEDY90A-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GREEDY90A-GISEL-NEXT: s_nop 7 ; GREEDY90A-GISEL-NEXT: s_nop 1 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GREEDY90A-GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GREEDY90A-GISEL-NEXT: s_endpgm ; ; FAST90A-LABEL: test_mfma_f32_16x16x1f32: ; FAST90A: ; %bb.0: ; %bb ; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0 ; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0 ; FAST90A-NEXT: v_mov_b32_e32 v0, 0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0 ; FAST90A-NEXT: s_waitcnt lgkmcnt(0) ; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4 ; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5 ; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6 ; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7 ; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8 ; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9 ; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10 ; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11 ; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12 ; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13 ; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14 ; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15 ; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16 ; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17 ; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18 ; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15] ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16 
; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17 ; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18 ; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19 ; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20 ; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21 ; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22 ; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23 ; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24 ; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25 ; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26 ; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27 ; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28 ; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29 ; FAST90A-NEXT: s_nop 1 ; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; FAST90A-NEXT: s_nop 7 ; FAST90A-NEXT: s_nop 2 ; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; FAST90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; FAST90A-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0) %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0) %tmp.1 = shufflevector <16 x float> %mai.2, <16 x float> %mai.1, <16 x i32> %mai.3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %tmp.1, i32 0, i32 0, i32 0) store <16 x float> %mai.3, ptr addrspace(1) %arg ret void } ; This instruction allows the overlap since it only read 4 registers. 
; SrcC here is a[0:3], i.e. exactly 4 registers, so a partially overlapping
; Dst such as a[2:5] is accepted in the GREEDY* check blocks below.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY908:       ; %bb.0: ; %bb
; GREEDY908-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GREEDY908-NEXT:    v_mov_b32_e32 v0, 1.0
; GREEDY908-NEXT:    v_mov_b32_e32 v1, 2.0
; GREEDY908-NEXT:    v_mov_b32_e32 v4, 0
; GREEDY908-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY908-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT:    v_mov_b32_e32 v5, s0
; GREEDY908-NEXT:    v_mov_b32_e32 v2, s1
; GREEDY908-NEXT:    v_mov_b32_e32 v3, s2
; GREEDY908-NEXT:    v_accvgpr_write_b32 a0, v5
; GREEDY908-NEXT:    v_mov_b32_e32 v5, s3
; GREEDY908-NEXT:    v_accvgpr_write_b32 a1, v2
; GREEDY908-NEXT:    v_accvgpr_write_b32 a2, v3
; GREEDY908-NEXT:    v_accvgpr_write_b32 a3, v5
; GREEDY908-NEXT:    s_nop 0
; GREEDY908-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY908-NEXT:    v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3]
; GREEDY908-NEXT:    s_nop 1
; GREEDY908-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY908-NEXT:    s_nop 3
; GREEDY908-NEXT:    v_accvgpr_read_b32 v0, a0
; GREEDY908-NEXT:    v_accvgpr_read_b32 v1, a1
; GREEDY908-NEXT:    v_accvgpr_read_b32 v2, a2
; GREEDY908-NEXT:    v_accvgpr_read_b32 v3, a3
; GREEDY908-NEXT:    s_nop 1
; GREEDY908-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
; GREEDY908-NEXT:    s_endpgm
;
; GREEDY90A-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY90A:       ; %bb.0: ; %bb
; GREEDY90A-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GREEDY90A-NEXT:    v_mov_b32_e32 v0, 1.0
; GREEDY90A-NEXT:    v_mov_b32_e32 v1, 2.0
; GREEDY90A-NEXT:    v_mov_b32_e32 v2, 0
; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY90A-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT:    v_accvgpr_write_b32 a0, s0
; GREEDY90A-NEXT:    v_accvgpr_write_b32 a1, s1
; GREEDY90A-NEXT:    v_accvgpr_write_b32 a2, s2
; GREEDY90A-NEXT:    v_accvgpr_write_b32 a3, s3
; GREEDY90A-NEXT:    s_nop 1
; GREEDY90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY90A-NEXT:    v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3]
; GREEDY90A-NEXT:    s_nop 1
; GREEDY90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY90A-NEXT:    s_nop 4
; GREEDY90A-NEXT:    global_store_dwordx4 v2, a[0:3], s[6:7]
; GREEDY90A-NEXT:    s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY942:       ; %bb.0: ; %bb
; GREEDY942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GREEDY942-NEXT:    v_mov_b32_e32 v0, 1.0
; GREEDY942-NEXT:    v_mov_b32_e32 v1, 2.0
; GREEDY942-NEXT:    v_mov_b32_e32 v2, 0
; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY942-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT:    v_accvgpr_write_b32 a0, s0
; GREEDY942-NEXT:    v_accvgpr_write_b32 a1, s1
; GREEDY942-NEXT:    v_accvgpr_write_b32 a2, s2
; GREEDY942-NEXT:    v_accvgpr_write_b32 a3, s3
; GREEDY942-NEXT:    s_nop 1
; GREEDY942-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
; GREEDY942-NEXT:    s_nop 1
; GREEDY942-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[2:5], v0, v1, a[0:3]
; GREEDY942-NEXT:    s_nop 1
; GREEDY942-NEXT:    v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
; GREEDY942-NEXT:    s_nop 3
; GREEDY942-NEXT:    global_store_dwordx4 v2, a[0:3], s[6:7]
; GREEDY942-NEXT:    s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_4x4x1f32:
; GREEDY90A-GISEL:       ; %bb.0: ; %bb
; GREEDY90A-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
; GREEDY90A-GISEL-NEXT:    v_mov_b32_e32 v0, 1.0
; GREEDY90A-GISEL-NEXT:    v_mov_b32_e32 v1, 2.0
; GREEDY90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY90A-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
; GREEDY90A-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GREEDY90A-GISEL-NEXT:    v_accvgpr_write_b32 a0, s0
; GREEDY90A-GISEL-NEXT:    v_accvgpr_write_b32 a1, s1
; GREEDY90A-GISEL-NEXT:    v_accvgpr_write_b32 a2, s2
; GREEDY90A-GISEL-NEXT:    v_accvgpr_write_b32 a3, s3
; GREEDY90A-GISEL-NEXT:    s_nop 1
; GREEDY90A-GISEL-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY90A-GISEL-NEXT:    v_mfma_f32_4x4x1f32 a[2:5], v0, v1, a[0:3]
; GREEDY90A-GISEL-NEXT:    s_nop 1
; GREEDY90A-GISEL-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; GREEDY90A-GISEL-NEXT:    v_mov_b32_e32 v0, 0
; GREEDY90A-GISEL-NEXT:    s_nop 3
; GREEDY90A-GISEL-NEXT:    global_store_dwordx4 v0, a[0:3], s[6:7]
; GREEDY90A-GISEL-NEXT:    s_endpgm
;
; FAST90A-LABEL: test_mfma_f32_4x4x1f32:
; FAST90A:       ; %bb.0: ; %bb
; FAST90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; FAST90A-NEXT:    v_mov_b32_e32 v1, 1.0
; FAST90A-NEXT:    v_mov_b32_e32 v2, 2.0
; FAST90A-NEXT:    v_mov_b32_e32 v0, 0
; FAST90A-NEXT:    s_waitcnt lgkmcnt(0)
; FAST90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; FAST90A-NEXT:    s_waitcnt lgkmcnt(0)
; FAST90A-NEXT:    v_accvgpr_write_b32 a0, s4
; FAST90A-NEXT:    v_accvgpr_write_b32 a1, s5
; FAST90A-NEXT:    v_accvgpr_write_b32 a2, s6
; FAST90A-NEXT:    v_accvgpr_write_b32 a3, s7
; FAST90A-NEXT:    s_nop 1
; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[4:7], v1, v2, a[0:3]
; FAST90A-NEXT:    s_nop 4
; FAST90A-NEXT:    v_accvgpr_mov_b32 a2, a4
; FAST90A-NEXT:    v_accvgpr_mov_b32 a3, a5
; FAST90A-NEXT:    s_nop 1
; FAST90A-NEXT:    v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
; FAST90A-NEXT:    s_nop 4
; FAST90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
; FAST90A-NEXT:    s_endpgm
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  ; Mask indices 0-3 select lanes of %mai.1 and 4-7 select lanes of %mai.2.
  ; The result is lanes 0-1 of %mai.1 followed by lanes 0-1 of %mai.2, matching
  ; the a[0:1] / a[2:3] contents the expected code feeds to the third MFMA.
  %tmp.1 = shufflevector <4 x float> %mai.1, <4 x float> %mai.2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %mai.3 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %tmp.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.3, ptr addrspace(1) %arg
  ret void
}

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }