; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float, float, <4 x float>, i32, i32, i32) declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half>, <4 x half>, <16 x float>, i32, i32, i32) declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half>, <4 x half>, <4 x float>, i32, i32, i32) declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32, i32, i32) declare <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32, i32, <16 x i32>, i32, i32, i32) declare <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32, i32, <4 x 
i32>, i32, i32, i32) declare i32 @llvm.amdgcn.workitem.id.x() define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, 
v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; 
LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: 
s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX90A-NEXT: 
global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX942-NEXT: 
v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0 
; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; 
NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v6, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; LIT-SRCC-NEXT: 
v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; 
GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: 
v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load 
<16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void }
; test_mfma_f32_4x4x1f32: loads a <4 x float> accumulator from %arg, passes it
; to @llvm.amdgcn.mfma.f32.4x4x1f32 together with the float constants 1.0 and
; 2.0 and the trailing operands i32 1, i32 2, i32 3, then stores the result back
; to %arg. Every expected v_mfma line for this kernel carries
; cbsz:1 abid:2 blgp:3; the gfx942 runs expect the mnemonic
; v_mfma_f32_4x4x1_16b_f32, the other runs expect v_mfma_f32_4x4x1f32.
; The assertions are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_kernel void @test_mfma_f32_4x4x1f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; 
LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 3 ; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], 
; test_mfma_f32_32x32x2f32 (its define appears further along the following
; line, after the tail of the preceding kernel): loads a <16 x float>
; accumulator from %arg, passes it to @llvm.amdgcn.mfma.f32.32x32x2f32 together
; with the float constants 1.0 and 2.0 and the trailing operands i32 1, i32 2,
; i32 3, then stores the result back to %arg. Every expected v_mfma line for
; this kernel carries cbsz:1 abid:2 blgp:3; the gfx942 runs expect the mnemonic
; v_mfma_f32_32x32x2_f32, the other runs expect v_mfma_f32_32x32x2f32.
; The assertions are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 3 ; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_32x32x2f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x2f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, 
v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x2f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; 
LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x2f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x2f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x2f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], 
; test_mfma_f32_16x16x4f32 (its define appears further along the following
; line, after the tail of the preceding kernel): loads a <4 x float>
; accumulator from %arg, passes it to @llvm.amdgcn.mfma.f32.16x16x4f32 together
; with the float constants 1.0 and 2.0 and the trailing operands i32 1, i32 2,
; i32 3, then stores the result back to %arg. Every expected v_mfma line for
; this kernel carries cbsz:1 abid:2 blgp:3; the gfx942 runs expect the mnemonic
; v_mfma_f32_16x16x4_f32, the other runs expect v_mfma_f32_16x16x4f32.
; The assertions are autogenerated (see the NOTE at the top of the file):
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x2f32: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x2_f32 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float 1.0, float 2.0, <16 x float> %in.1, i32 1, i32 2, i32 3) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_16x16x4f32(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x4f32: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: 
s_load_dwordx4 s[0:3], s[6:7], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x4f32: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; 
LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x4f32: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x4f32: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_f32 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f32: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 
v[2:3], s[2:3] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_f32 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x4f32(float 1.0, float 2.0, <4 x float> %in.1, i32 1, i32 2, i32 3) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_32x32x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s26 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s27 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s28 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s29 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s30 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s31 ; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a13, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s3 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s6 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s9 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s15 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 ; 
NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; 
NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x4f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s24 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s25 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s26 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s27 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s28 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s29 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s30 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 
s31 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s3 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s6 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s9 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s20 ; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s15 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s3 ; LIT-SRCC-NEXT: s_nop 1 ; 
LIT-SRCC-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[0:1], a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:96 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:112 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:64 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:80 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:32 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[36:37] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s0 ; GFX90A-NEXT: v_mov_b32_e32 v3, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 
a22, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: v_mov_b32_e32 v4, s2 ; GFX90A-NEXT: v_mov_b32_e32 v5, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x4f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NEXT: v_mov_b32_e32 v2, s0 ; GFX942-NEXT: v_mov_b32_e32 v3, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: v_mov_b32_e32 v4, s2 ; GFX942-NEXT: v_mov_b32_e32 v5, s3 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x4_2b_f16 a[0:31], v[2:3], v[4:5], a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[36:37] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[36:37] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[36:37] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[36:37] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[36:37] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[36:37] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[36:37] ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[36:37] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x4f16: ; GFX942-VGPR: 
; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[36:39], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[36:37], 0x40 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[36:37], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[38:39], 0x0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, s0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v35, s1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v36, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v37, s3 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: 
v_mfma_f32_32x32x4_2b_f16 v[0:31], v[34:35], v[36:37], v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[36:37] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[36:37] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[36:37] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[36:37] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[36:37] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[36:37] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[36:37] ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[36:37] offset:16 ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 %c.2 = load <4 x half>, ptr addrspace(1) %c2p %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %c.1, <4 x half> %c.2, <32 x float> %in.1, i32 1, i32 2, i32 3) store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } ; Loads a <16 x float> accumulator from %arg and two <4 x half> operands from %c (elements 0 and 1), ; calls llvm.amdgcn.mfma.f32.16x16x4f16 with trailing immediates 1, 2, 3 and stores the result back to %arg. ; Every check prefix below expects those immediates as cbsz:1 abid:2 blgp:3 on the selected MFMA. ; The check lines are autogenerated by update_llc_test_checks.py, so regenerate them rather than editing by hand. define amdgpu_kernel void @test_mfma_f32_16x16x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x4f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 ; 
NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], 
s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x4f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v12, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v13 ; LIT-SRCC-NEXT: 
v_mov_b32_e32 v3, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 ; 
GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x4f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; 
GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x4_4b_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x4_4b_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: 
global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 %c.2 = load <4 x half>, ptr addrspace(1) %c2p %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x4f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } ; Loads a <4 x float> accumulator from %arg and two <4 x half> operands from %c (elements 0 and 1), ; calls llvm.amdgcn.mfma.f32.4x4x4f16 with trailing immediates 1, 2, 3 and stores the result back to %arg. ; Every check prefix below expects those immediates as cbsz:1 abid:2 blgp:3 on the selected MFMA. ; The check lines are autogenerated by update_llc_test_checks.py, so regenerate them rather than editing by hand. define amdgpu_kernel void @test_mfma_f32_4x4x4f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x4f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; 
NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_4x4x4f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_4x4x4f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_4x4x4f16 a[0:3], v[2:3], v[4:5], a[0:3] 
cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x4f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_4x4x4_16b_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 4 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x4f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x4_16b_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 4 ; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c %c2p = getelementptr <4 x half>, ptr addrspace(1) 
%c, i64 1 %c.2 = load <4 x half>, ptr addrspace(1) %c2p %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x4f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } ; Loads a <16 x float> accumulator from %arg and two <4 x half> operands from %c (elements 0 and 1), ; calls llvm.amdgcn.mfma.f32.32x32x8f16 with trailing immediates 1, 2, 3 and stores the result back to %arg. ; Every check prefix below expects those immediates as cbsz:1 abid:2 blgp:3 on the selected MFMA. ; The check lines are autogenerated by update_llc_test_checks.py, so regenerate them rather than editing by hand. define amdgpu_kernel void @test_mfma_f32_32x32x8f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x8f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 ; NOLIT-SRCC-NEXT: 
v_mov_b32_e32 v4, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x8f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s20 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 
s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s21 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s22 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s23 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x8f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v0, s20 ; GFX90A-NEXT: v_mov_b32_e32 v1, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_mov_b32_e32 v2, s22 ; GFX90A-NEXT: v_mov_b32_e32 v3, s23 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] 
offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x8f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v0, s20 ; GFX942-NEXT: v_mov_b32_e32 v1, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, s22 ; GFX942-NEXT: v_mov_b32_e32 v3, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x24 ; GFX942-VGPR-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[20:23], s[18:19], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s21 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s23 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], v[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <16 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 %c.2 = load <4 x half>, ptr addrspace(1) %c2p %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> %c.1, <4 x half> %c.2, <16 x float> %in.1, i32 1, i32 2, i32 3) store <16 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_16x16x16f16(ptr addrspace(1) %arg, ptr addrspace(1) %c) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x16f16: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_waitcnt 
lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x16f16: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v6, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v7, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s11 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v7 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s7 ; LIT-SRCC-NEXT: s_nop 1 ; 
LIT-SRCC-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x16f16: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: v_mov_b32_e32 v3, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX90A-NEXT: v_mov_b32_e32 v4, s6 ; GFX90A-NEXT: v_mov_b32_e32 v5, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x16f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x16f16: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v2, s4 ; GFX942-NEXT: v_mov_b32_e32 v3, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a0, s8 ; GFX942-NEXT: v_mov_b32_e32 v4, s6 ; GFX942-NEXT: v_mov_b32_e32 v5, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s11 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: 
v_mfma_f32_16x16x16_f16 a[0:3], v[2:3], v[4:5], a[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 6 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x16f16: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s5 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s7 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x16_f16 v[0:3], v[6:7], v[8:9], v[0:3] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 6 ; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %c.1 = load <4 x half>, ptr addrspace(1) %c %c2p = getelementptr <4 x half>, ptr addrspace(1) %c, i64 1 %c.2 = load <4 x half>, ptr addrspace(1) %c2p %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> %c.1, <4 x half> %c.2, <4 x float> %in.1, i32 1, i32 2, i32 3) store <4 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_i32_32x32x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; 
NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 
a24, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v5, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_32x32x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s21 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s22 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 
v0, s24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1 ; 
LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v3, v0, a[0:31] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a24 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:96 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a3 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:112 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a16 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:64 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a13 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:80 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a11 ; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v14, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[34:35] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[34:35] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[34:35] ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_32x32x4i8: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v1, 1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a26, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_32x32x4i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 2 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_32x32x4i8: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v1, 1 ; GFX942-NEXT: v_mov_b32_e32 v2, 2 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 
; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_32x32x4_2b_i8 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_32x32x4i8: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_i32_32x32x4_2b_i8 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] 
offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x i32>, ptr addrspace(1) %arg %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 1, i32 2, <32 x i32> %in.1, i32 1, i32 2, i32 3) store <32 x i32> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_i32_16x16x4i8(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 
s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; 
LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10 ; 
LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_i32_16x16x4i8: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_i32_16x16x4i8: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 
0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1 ; GFX942-NEXT: v_mov_b32_e32 v1, 2 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 
v[10:11], s[10:11]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, v[0:15] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
  store <16 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 16x16x4i8 MFMA whose src2 is the constant splat (i32 64) instead of a loaded
; value; the result is stored to %arg. In the checks below, the default gfx908
; run first writes 64 into a0-a15 and passes a[0:15] as src2, while every other
; run passes 64 directly as the src2 operand of the MFMA instruction.
; NOTE(review): %in.1 is loaded but never used in this function.
define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 64
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 64
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; NOLIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0
; LIT-SRCC-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a4
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a8
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: v_mov_b32_e32 v1, 2
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_16x16x4i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: v_mov_b32_e32 v1, 2
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_16x16x4_4b_i8 a[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  %in.1 = load <16 x i32>, ptr addrspace(1) %arg
  %mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.16x16x4i8(i32 1, i32 2, <16 x i32> splat (i32 64), i32 1, i32 2, i32 3)
  store <16 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; Loads a <4 x i32> value from %arg, passes it as src2 to the 4x4x4i8 MFMA
; intrinsic (the first two operands are the constants 1 and 2) and stores the
; result back to %arg. The three trailing immediates 1, 2, 3 show up as
; cbsz:1 abid:2 blgp:3 on the MFMA instruction in the checks below.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0
; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1
; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5
; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
; GFX90A-NEXT: s_nop 1
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 4
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: v_mov_b32_e32 v2, 2
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 4
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v4, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 4
; GFX942-VGPR-NEXT: global_store_dwordx4 v5, v[0:3], s[6:7]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 4x4x4i8 MFMA whose src2 is the constant splat (i32 1). In the checks below,
; the default gfx908 run writes 1 into a0-a3 and passes a[0:3] as src2, while
; every other run passes 1 directly as the src2 operand of the MFMA instruction.
; NOTE(review): %in.1 is loaded but never used in this function.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_imm_src2_1(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v0, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_mov_b32_e32 v2, 2
; GFX942-NEXT: v_mov_b32_e32 v1, 0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v0, v2, 1 cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_imm_src2_1:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v0, v1, 1 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
bb:
  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 1), i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

; 4x4x4i8 MFMA whose src2 is the constant splat (i32 65). In the checks of
; every run below, the value is first moved into a register (v_mov_b32 with
; 0x41) and the MFMA takes a register tuple as src2; no run passes 65 as an
; immediate operand.
; NOTE(review): unlike the neighbouring tests, this function has no explicit
; entry label, and %in.1 is loaded but never used. The "_1" suffix in the name
; does not match the splat value 65 - presumably carried over from the test
; above; confirm before renaming, since the label checks depend on the name.
define amdgpu_kernel void @test_mfma_i32_4x4x4i8_splat_k_src2_1(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; NOLIT-SRCC: ; %bb.0:
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; LIT-SRCC: ; %bb.0:
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x41
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v0, a[0:3] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x41
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, 1
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_i32_4x4x4i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX942: ; %bb.0:
; GFX942-NEXT: v_mov_b32_e32 v1, 0x41
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
; GFX942-NEXT: v_mov_b32_e32 v1, 1
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_mov_b32_e32 v2, 2
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_i32_4x4x4_16b_i8 a[0:3], v1, v2, a[0:3] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_i32_4x4x4i8_splat_k_src2_1:
; GFX942-VGPR: ; %bb.0:
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x41
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_i32_4x4x4_16b_i8 v[0:3], v5, v6, v[0:3] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
  %in.1 = load <4 x i32>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.4x4x4i8(i32 1, i32 2, <4 x i32> splat (i32 65), i32 1, i32 2, i32 3)
  store <4 x i32> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s16
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s17
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s18
; NOLIT-SRCC-NEXT:
v_accvgpr_write_b32 a0, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; 
NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s16 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s18 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0 ; LIT-SRCC-NEXT: 
v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s22 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s23 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s25 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s26 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s27 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s28 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s29 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s30 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s31 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s9 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s19 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s12 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, s20 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s21 ; LIT-SRCC-NEXT: 
v_accvgpr_write_b32 a3, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s15 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v2 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] 
offset:80 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX90A-NEXT: 
v_accvgpr_write_b32 a9, s25 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX90A-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 
v1, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s16 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s17 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s18 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s19 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s20 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s21 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s22 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s23 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s24 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s25 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s26 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s27 ; GFX942-NEXT: v_accvgpr_write_b32 a12, s28 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s29 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s30 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s31 ; GFX942-NEXT: v_accvgpr_write_b32 a16, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a17, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a18, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a19, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a20, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a21, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a22, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a23, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a24, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a25, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a26, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a27, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a28, s12 ; GFX942-NEXT: v_accvgpr_write_b32 a29, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a30, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a31, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[34:35] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[34:35] offset:112 ; GFX942-NEXT: 
global_store_dwordx4 v0, a[16:19], s[34:35] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[34:35] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[34:35] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[34:35] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[34:35] ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[34:35] offset:16 ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_forward_acc: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, s16 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, s17 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, s18 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, s19 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, s20 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, s21 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, s22 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, s23 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, s24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, s25 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, s26 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, s27 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, s28 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, s29 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, s30 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, s31 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, s0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, s1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, s2 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, s3 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, s4 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, s5 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, s6 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, s7 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, s8 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, s9 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, s10 ; 
GFX942-VGPR-NEXT: v_mov_b32_e32 v27, s11 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, s12 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, s13 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, s14 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, s15 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, v[0:31] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[34:35] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[34:35] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[34:35] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[34:35] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[34:35] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[34:35] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[34:35] ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[34:35] offset:16 ; GFX942-VGPR-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) %mai.2 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %mai.1, i32 0, i32 0, i32 0) store <32 x float> %mai.2, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_16x16x1f32_forward_acc(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; 
NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v11, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v16, 0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s4 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s6 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s7 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s8 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s9 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s10 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s11 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s12 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v1 ; 
LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, s13 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s14 ; LIT-SRCC-NEXT: v_mov_b32_e32 v17, s15 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v1 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v17 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a0 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:16 ; LIT-SRCC-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 
s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12 ; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX942-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX942-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX942-NEXT: v_accvgpr_write_b32 a4, s4 ; GFX942-NEXT: v_accvgpr_write_b32 a5, s5 ; GFX942-NEXT: v_accvgpr_write_b32 a6, s6 ; GFX942-NEXT: v_accvgpr_write_b32 a7, s7 ; GFX942-NEXT: v_accvgpr_write_b32 a8, s8 ; GFX942-NEXT: v_accvgpr_write_b32 a9, s9 ; GFX942-NEXT: v_accvgpr_write_b32 a10, s10 ; GFX942-NEXT: v_accvgpr_write_b32 a11, s11 ; GFX942-NEXT: v_accvgpr_write_b32 a12, 
s12 ; GFX942-NEXT: v_accvgpr_write_b32 a13, s13 ; GFX942-NEXT: v_accvgpr_write_b32 a14, s14 ; GFX942-NEXT: v_accvgpr_write_b32 a15, s15 ; GFX942-NEXT: s_nop 1 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15] ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_forward_acc: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], s[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], s[8:9] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], s[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, v[0:15] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17] ; 
GFX942-VGPR-NEXT: s_endpgm
; Forward-acc test body for the 16x16x1f32 MFMA: load a <16 x float> value from
; %arg, run the intrinsic twice with float operands 1.0 and 2.0 (the three
; trailing i32 operands are all 0), passing the first result (%mai.1) as the
; vector operand of the second call, then store the final result back to %arg.
bb:
  %in.1 = load <16 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
  store <16 x float> %mai.2, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_4x4x1f32_forward_acc(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, s1
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, s2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v5, s3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
;
LIT-SRCC-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, s1 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, s2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v5 ; LIT-SRCC-NEXT: v_mov_b32_e32 v5, s3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v2 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v3 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v5 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; LIT-SRCC-NEXT: s_nop 3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_forward_acc: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1 ; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2 ; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; GFX90A-NEXT: s_nop 4 ; GFX90A-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_forward_acc: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX942-NEXT: v_accvgpr_write_b32 a0, s0
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v1, a[0:3]
; GFX942-NEXT: s_nop 3
; GFX942-NEXT: global_store_dwordx4 v2, a[0:3], s[6:7]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_forward_acc:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 2.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v4, v5, v[0:3]
; GFX942-VGPR-NEXT: s_nop 3
; GFX942-VGPR-NEXT: global_store_dwordx4 v6, v[0:3], s[6:7]
; GFX942-VGPR-NEXT: s_endpgm
; Forward-acc test body for the 4x4x1f32 MFMA: load a <4 x float> value from
; %arg, run the intrinsic twice with float operands 1.0 and 2.0 (the three
; trailing i32 operands are all 0), passing the first result (%mai.1) as the
; vector operand of the second call, then store the final result back to %arg.
bb:
  %in.1 = load <4 x float>, ptr addrspace(1) %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  store <4 x float> %mai.2, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0
; NOLIT-SRCC-NEXT:
v_accvgpr_write_b32 a3, 1.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; NOLIT-SRCC-NEXT: s_nop 3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, 1.0 ; LIT-SRCC-NEXT: s_nop 3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v2, 1.0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 ; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm_splat: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-NEXT: s_nop 0 
; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v0, v2, 1.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v1, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
; Immediate-splat test body for the 4x4x1f32 MFMA: the vector operand is a
; constant with every lane equal to 1.0 rather than a loaded value. The expected
; output for this function either passes 1.0 directly as the last MFMA operand
; or, in the NOLIT-SRCC run, first writes 1.0 into a0..a3 with
; v_accvgpr_write_b32. The result is stored to %arg.
bb:
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 1.0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT:
v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm_splat: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v8, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:32 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm_splat: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm_splat: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: 
global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
; Immediate-splat test body for the 16x16x1f32 MFMA: the vector operand is a
; constant with all 16 lanes equal to 1.0 rather than a loaded value. The
; expected output for this function either passes 1.0 directly as the last MFMA
; operand or, in the NOLIT-SRCC run, first writes 1.0 into a0..a15 with
; v_accvgpr_write_b32. The result is stored to %arg.
bb:
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x8f16_imm_splat: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, v0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v2, 0x40004000 ; LIT-SRCC-NEXT: v_mov_b32_e32 v3, v2 ; LIT-SRCC-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v13, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a4 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a3 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[4:7], s[0:1] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[8:11], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v13, v[9:12], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x8f16_imm_splat: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40004000 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x8f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], 
s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x8f16_imm_splat: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: v_mov_b32_e32 v2, 0x40004000 ; GFX942-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x8_f16 a[0:15], v[0:1], v[2:3], 1.0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 1 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm
; All three vector operands are constant splats. A is half 1.0 and B is
; half 2.0, which the assertions above expect packed as 0x3c003c00 and
; 0x40004000. The accumulator is float 1.0 in every lane, which the assertions
; expect either as the inline srcC operand 1.0 or as sixteen
; v_accvgpr_write_b32 initialisations with 1.0.
bb:
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %arg) #0 { ; 
NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm_splat: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a22, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 
v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 
v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm_splat: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v14, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a24 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a20 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:112 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[4:7], s[0:1] offset:96 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[8:11], s[0:1] offset:80 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a18 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: 
v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[10:13], s[0:1] offset:64 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v14, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm_splat: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm_splat: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0 ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) store <32 x float> %mai.1, ptr addrspace(1) 
%arg ret void } define amdgpu_kernel void @test_mfma_f32_4x4x1f32_imm(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; NOLIT-SRCC-NEXT: s_nop 3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 2.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; LIT-SRCC-NEXT: s_nop 3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_4x4x1f32_imm: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: s_nop 0 ; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 3 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_4x4x1f32_imm: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 2.0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 2 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_imm: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v0, v1, v[0:3] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 2 ; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm
; Non-splat accumulator: lane 1 is 2.0 and the other three lanes are 1.0.
; Every set of assertions above therefore initialises the accumulator
; registers first and passes a register range as srcC instead of an inline
; constant.
bb:
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 1.0, float 2.0, float 1.0, float 1.0>, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm: ; 
NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 2.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 
offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_16x16x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 2.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 1.0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], 
s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_16x16x1f32_imm: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a15, 2.0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_16x16x1f32_imm: ; GFX942: ; %bb.0: ; %bb ; 
GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: v_accvgpr_write_b32 a15, 2.0 ; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0 ; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v1, v2, a[0:15] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, 2.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v0 ; 
GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0 ; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v15, v[0:15] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm
; Non-splat accumulator: lanes 0 through 14 are 1.0 and lane 15 is 2.0.
; Every set of assertions above therefore initialises the accumulator
; registers first and passes a register range as srcC instead of an inline
; constant.
bb:
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 2.0, <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0>, i32 0, i32 0, i32 0)
  store <16 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 ; NOLIT-SRCC-NEXT: 
v_accvgpr_write_b32 a22, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 7 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a18 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; NOLIT-SRCC-NEXT: s_nop 0 ; 
NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; NOLIT-SRCC-NEXT: s_nop 0 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; NOLIT-SRCC-NEXT: s_endpgm ; ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, 0 ; LIT-SRCC-NEXT: 
v_accvgpr_write_b32 a15, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, 0 ; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0 ; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 7 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a31 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a30 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a29 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a28 ; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0) ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a23 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a22 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a21 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a20 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a19 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 
v2, a18 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a17 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a16 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a15 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a14 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a13 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a12 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a11 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a10 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a9 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a8 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 ; LIT-SRCC-NEXT: s_nop 0 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1 ; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0 ; LIT-SRCC-NEXT: s_nop 1 ; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; LIT-SRCC-NEXT: s_endpgm ; ; GFX90A-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX90A: ; %bb.0: ; %bb ; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a12, 
a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a1 ; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 7 ; GFX90A-NEXT: s_nop 1 ; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX90A-NEXT: s_endpgm ; ; GFX942-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: v_mov_b32_e32 v1, 1.0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a0, 1.0 ; GFX942-NEXT: v_accvgpr_mov_b32 a2, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a3, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a4, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a5, a1 ; GFX942-NEXT: 
v_accvgpr_mov_b32 a6, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a7, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a8, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a9, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a10, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a11, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a12, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a13, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a14, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a15, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a16, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a17, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a18, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a19, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a20, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a21, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a22, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a23, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a24, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a25, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a26, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a27, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a28, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a29, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a30, a1 ; GFX942-NEXT: v_accvgpr_mov_b32 a31, a1 ; GFX942-NEXT: v_mov_b32_e32 v2, 2.0 ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 7 ; GFX942-NEXT: s_nop 0 ; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112 ; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96 ; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80 ; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64 ; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GFX942-NEXT: s_endpgm ; ; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm: ; GFX942-VGPR: ; %bb.0: ; %bb ; GFX942-VGPR-NEXT: 
v_mov_b32_e32 v0, 1.0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 0 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v7, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v8, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v9, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v10, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v11, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v12, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v13, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v14, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v15, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v20, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v21, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v22, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v23, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v24, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v25, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v26, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v27, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v28, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v29, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v30, v1 ; GFX942-VGPR-NEXT: v_mov_b32_e32 v31, v1 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[32:33], v[30:31] ; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0 ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[30:31], v[28:29] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[28:29], v[26:27] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[26:27], v[24:25] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[24:25], v[22:23] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[22:23], v[20:21] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[20:21], v[18:19] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[18:19], v[16:17] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[16:17], v[14:15] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[14:15], v[12:13] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[12:13], v[10:11] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[10:11], v[8:9] ; 
GFX942-VGPR-NEXT: v_mov_b64_e32 v[8:9], v[6:7] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[6:7], v[4:5] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[4:5], v[2:3] ; GFX942-VGPR-NEXT: v_mov_b64_e32 v[2:3], v[0:1] ; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[2:33], v0, v34, v[2:33] ; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 7 ; GFX942-VGPR-NEXT: s_nop 0 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[30:33], s[0:1] offset:112 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[26:29], s[0:1] offset:96 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[22:25], s[0:1] offset:80 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[18:21], s[0:1] offset:64 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[14:17], s[0:1] offset:48 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[10:13], s[0:1] offset:32 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[6:9], s[0:1] offset:16 ; GFX942-VGPR-NEXT: global_store_dwordx4 v1, v[2:5], s[0:1] ; GFX942-VGPR-NEXT: s_endpgm bb: %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> , i32 0, i32 0, i32 0) store <32 x float> %mai.1, ptr addrspace(1) %arg ret void } define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat(ptr addrspace(1) %arg, i64 %idx) #0 { ; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000 ; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1 ; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0 ; NOLIT-SRCC-NEXT: s_nop 1 ; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3] ; NOLIT-SRCC-NEXT: s_nop 3 ; NOLIT-SRCC-NEXT: 
v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 0x42f60000
; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v1
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v1, a[0:3]
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
;
; The srcC operand is a constant splat of 123.0. Its bit pattern is 0x42f60000,
; which is the literal the checks above place in all four accumulator
; registers before the MFMA. The result is stored to the <4 x float> element
; of %arg selected by the workitem id.
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  ;store <4 x float> %mai.1, ptr addrspace(1) %arg
  store <4 x float> %mai.1, ptr addrspace(1) %gep
  ret void
}

define amdgpu_kernel void @test_mfma_f32_4x4x1f32_lit_splat_bad_code(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x42f60000
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0
;
NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3]
; NOLIT-SRCC-NEXT: s_nop 3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 0x42f60000
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 1.0
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v0
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 2.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v4, 0
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v0, a[0:3]
; LIT-SRCC-NEXT: s_nop 3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a3
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x42f60000
; GFX90A-NEXT: v_accvgpr_write_b32 a0, v1
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v1, v2, a[0:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_nop 3
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: v_mov_b32_e32 v1, 0x42f60000
; GFX942-NEXT: v_accvgpr_write_b32 a0, v1
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: v_mfma_f32_4x4x1_16b_f32 a[0:3], v1, v2, a[0:3]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_nop 2
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: v_mov_b32_e32 v5, 1.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x42f60000
; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v6, 2.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v4, 0
; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: v_mfma_f32_4x4x1_16b_f32 v[0:3], v5, v6, v[0:3]
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 2
; GFX942-VGPR-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: s_endpgm
;
; The srcC operand is a constant splat of 123.0 (bit pattern 0x42f60000, the
; literal the checks above place in all four accumulator registers). The
; result is stored directly to %arg; %tid and %gep are computed but not used
; by any later instruction.
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %arg, i32 %tid
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 2.0, <4 x float> <float 123.0, float 123.0, float 123.0, float 123.0>, i32 0, i32 0, i32 0)
  store <4 x float> %mai.1, ptr addrspace(1) %arg
  ret void
}

define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vecarg(ptr addrspace(1) %arg) #0 {
; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; NOLIT-SRCC: ; %bb.0: ; %bb
; NOLIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; NOLIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; NOLIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; NOLIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; NOLIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21
; NOLIT-SRCC-NEXT:
v_accvgpr_write_b32 a22, v22
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30
; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; NOLIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 7
; NOLIT-SRCC-NEXT: s_nop 1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; NOLIT-SRCC-NEXT: s_nop 0
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; NOLIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; NOLIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; NOLIT-SRCC-NEXT: s_endpgm
;
; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; LIT-SRCC: ; %bb.0: ; %bb
; LIT-SRCC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; LIT-SRCC-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; LIT-SRCC-NEXT: s_waitcnt lgkmcnt(0)
; LIT-SRCC-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; LIT-SRCC-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; LIT-SRCC-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; LIT-SRCC-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; LIT-SRCC-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; LIT-SRCC-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; LIT-SRCC-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; LIT-SRCC-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; LIT-SRCC-NEXT: s_waitcnt vmcnt(0)
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, v0
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, v1
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, v2
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, v3
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, v4
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a5, v5
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a6, v6
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a7, v7
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a8, v8
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a9, v9
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a10, v10
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a11, v11
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a12, v12
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a13, v13
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a14, v14
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a15, v15
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a16, v16
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a17, v17
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a18, v18
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a19, v19
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a20, v20
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a21, v21
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a22, v22
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a23, v23
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a24, v24
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a25, v25
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a26, v26
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a27, v27
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a28, v28
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a29, v29
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a30, v30
; LIT-SRCC-NEXT: v_accvgpr_write_b32 a31, v31
; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0
; LIT-SRCC-NEXT: v_mov_b32_e32 v1, 2.0
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] cbsz:1 abid:2 blgp:3
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 7
; LIT-SRCC-NEXT: s_nop 1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a27
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a26
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a25
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a24
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v7, a31
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v6, a30
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v5, a29
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v4, a28
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v11, a19
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v10, a18
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v9, a17
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v8, a16
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v15, a23
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v14, a22
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v13, a21
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v12, a20
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v19, a11
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v18, a10
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v17, a9
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v16, a8
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v23, a15
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v22, a14
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v21, a13
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v20, a12
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v27, a3
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v26, a2
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v25, a1
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v24, a0
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
; LIT-SRCC-NEXT: s_nop 0
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v3, a7
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v2, a6
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v1, a5
; LIT-SRCC-NEXT: v_accvgpr_read_b32 v0, a4
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
; LIT-SRCC-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
; LIT-SRCC-NEXT: s_endpgm
;
; GFX90A-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; GFX90A: ; %bb.0: ; %bb
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 2.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX90A-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
; GFX90A-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
; GFX90A-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
; GFX90A-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
; GFX90A-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
; GFX90A-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 7
; GFX90A-NEXT: s_nop 2
; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX90A-NEXT: s_endpgm
;
; GFX942-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; GFX942: ; %bb.0: ; %bb
; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-NEXT: v_lshlrev_b32_e32 v0, 7, v0
; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
; GFX942-NEXT: v_mov_b32_e32 v2, 2.0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: global_load_dwordx4 a[28:31], v0, s[0:1] offset:112
; GFX942-NEXT: global_load_dwordx4 a[24:27], v0, s[0:1] offset:96
; GFX942-NEXT: global_load_dwordx4 a[20:23], v0, s[0:1] offset:80
; GFX942-NEXT: global_load_dwordx4 a[16:19], v0, s[0:1] offset:64
; GFX942-NEXT: global_load_dwordx4 a[12:15], v0, s[0:1] offset:48
; GFX942-NEXT: global_load_dwordx4 a[8:11], v0, s[0:1] offset:32
; GFX942-NEXT: global_load_dwordx4 a[4:7], v0, s[0:1] offset:16
; GFX942-NEXT: global_load_dwordx4 a[0:3], v0, s[0:1]
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v2, a[0:31] cbsz:1 abid:2 blgp:3
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 7
; GFX942-NEXT: s_nop 1
; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GFX942-NEXT: s_endpgm
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_vecarg:
; GFX942-VGPR: ; %bb.0: ; %bb
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX942-VGPR-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX942-VGPR-NEXT: v_lshlrev_b32_e32 v32, 7, v0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v34, 2.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX942-VGPR-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX942-VGPR-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX942-VGPR-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX942-VGPR-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX942-VGPR-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX942-VGPR-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX942-VGPR-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX942-VGPR-NEXT: s_waitcnt vmcnt(0)
; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v33, v34, v[0:31] cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: s_nop 1
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
; GFX942-VGPR-NEXT: s_endpgm
;
; The srcC operand is a <32 x float> loaded from the element of %arg selected
; by the workitem id, and the MFMA result is written back to that same
; element. The three trailing i32 immediates are 1, 2 and 3; the checks above
; print them as cbsz:1 abid:2 blgp:3.
bb:
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %slot = getelementptr inbounds <32 x float>, ptr addrspace(1) %arg, i32 %lane
  %acc.in = load <32 x float>, ptr addrspace(1) %slot
  %acc.out = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %acc.in, i32 1, i32 2, i32 3)
  store <32 x float> %acc.out, ptr addrspace(1) %slot
  ret void
}

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX908: {{.*}}
; GFX908_A: {{.*}}
; GFX90A_42: {{.*}}