diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 1074 |
1 files changed, 1047 insertions, 27 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll index 6763957..f7aaa3e 100644 --- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll +++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll @@ -1,15 +1,148 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr: -; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s16 +; GFX908-NEXT: v_mov_b32_e32 v1, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s21 +; GFX908-NEXT: v_mov_b32_e32 v1, s22 +; GFX908-NEXT: v_mov_b32_e32 v2, s23 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s24 +; GFX908-NEXT: v_mov_b32_e32 v1, s25 +; GFX908-NEXT: v_mov_b32_e32 v2, s26 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s27 +; GFX908-NEXT: v_mov_b32_e32 v1, s28 +; GFX908-NEXT: v_mov_b32_e32 v2, s29 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s30 +; GFX908-NEXT: v_mov_b32_e32 v1, s31 +; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s1 +; GFX908-NEXT: v_mov_b32_e32 v1, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s5 +; GFX908-NEXT: v_mov_b32_e32 v2, s6 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s7 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: v_mov_b32_e32 v3, s19 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v1, s11 +; GFX908-NEXT: v_mov_b32_e32 v2, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v1, s14 +; GFX908-NEXT: v_mov_b32_e32 v2, s15 +; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -17,9 +150,142 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0 +; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v0, s16 +; GFX908-NEXT: v_mov_b32_e32 v1, s17 +; GFX908-NEXT: v_mov_b32_e32 v2, s18 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s21 +; GFX908-NEXT: v_mov_b32_e32 v1, s22 +; GFX908-NEXT: v_mov_b32_e32 v2, s23 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s24 +; GFX908-NEXT: v_mov_b32_e32 v1, s25 +; GFX908-NEXT: v_mov_b32_e32 v2, s26 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s27 +; GFX908-NEXT: v_mov_b32_e32 v1, s28 +; GFX908-NEXT: v_mov_b32_e32 v2, s29 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s30 +; GFX908-NEXT: v_mov_b32_e32 v1, s31 +; GFX908-NEXT: v_mov_b32_e32 v2, s0 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s1 +; GFX908-NEXT: v_mov_b32_e32 v1, s2 +; GFX908-NEXT: v_mov_b32_e32 v2, s3 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GFX908-NEXT: v_mov_b32_e32 v1, s5 +; GFX908-NEXT: v_mov_b32_e32 v2, s6 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s7 +; GFX908-NEXT: v_mov_b32_e32 v1, s8 +; GFX908-NEXT: v_mov_b32_e32 v2, s9 +; GFX908-NEXT: v_mov_b32_e32 v3, s19 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s10 +; GFX908-NEXT: v_mov_b32_e32 v1, s11 +; GFX908-NEXT: v_mov_b32_e32 v2, s12 +; GFX908-NEXT: v_mov_b32_e32 v5, s20 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, s13 +; GFX908-NEXT: v_mov_b32_e32 v1, s14 +; GFX908-NEXT: v_mov_b32_e32 v2, s15 +; GFX908-NEXT: v_mov_b32_e32 v3, 1.0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v2 +; GFX908-NEXT: v_mov_b32_e32 v0, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a28 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a16 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a20 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a8 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a12 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: s_endpgm bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -27,9 +293,105 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def a0 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={a0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -38,9 +400,105 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use a[100:131] +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_endpgm bb: call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison) %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -49,10 +507,105 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs: -; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX908-NEXT: v_mov_b32_e32 v32, 0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v0 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112 +; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64 +; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80 +; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32 +; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48 +; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] +; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16 +; GFX908-NEXT: s_endpgm bb: %acc = call i32 asm sideeffect "; def $0", "={v0}"() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -61,9 +614,127 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_call: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX908-NEXT: s_mov_b32 s38, -1 +; GFX908-NEXT: s_mov_b32 s39, 0xe00000 +; GFX908-NEXT: s_add_u32 s36, s36, s11 +; GFX908-NEXT: s_addc_u32 s37, s37, 0 +; GFX908-NEXT: s_mov_b32 s12, s8 +; GFX908-NEXT: s_add_u32 s8, s4, 44 +; GFX908-NEXT: s_mov_b32 s13, s9 +; GFX908-NEXT: s_addc_u32 s9, s5, 0 +; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24 +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX908-NEXT: s_mov_b32 s32, 0 +; GFX908-NEXT: v_mov_b32_e32 v40, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112 +; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96 +; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80 +; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64 +; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48 +; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32 +; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16 +; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v31 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a0 +; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112 +; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64 +; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80 +; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32 +; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48 +; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35] +; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16 +; GFX908-NEXT: s_endpgm bb: call void @foo() %in.1 = load <32 x float>, ptr addrspace(1) %arg @@ -75,10 +746,173 @@ bb: ; We could avoid scan to find calls since we see these during lowering before selection. ; However, in SDag lowering and selection is done block by block, so it would only work ; in Global ISel. - -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb: +; GFX908: ; %bb.0: ; %bb1 +; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 +; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 +; GFX908-NEXT: s_mov_b32 s54, -1 +; GFX908-NEXT: s_mov_b32 s55, 0xe00000 +; GFX908-NEXT: s_add_u32 s52, s52, s11 +; GFX908-NEXT: s_mov_b32 s14, s10 +; GFX908-NEXT: s_mov_b32 s12, s8 +; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c +; GFX908-NEXT: v_mov_b32_e32 v6, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v7, 0 +; GFX908-NEXT: s_addc_u32 s53, s53, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 +; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40 +; GFX908-NEXT: s_bitcmp0_b32 s8, 0 +; GFX908-NEXT: s_mov_b32 s32, 0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v3, s36 +; GFX908-NEXT: v_mov_b32_e32 v4, s37 +; GFX908-NEXT: v_mov_b32_e32 v5, s40 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, s38 +; GFX908-NEXT: v_mov_b32_e32 v4, s39 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v4 +; GFX908-NEXT: v_mov_b32_e32 v3, s41 +; GFX908-NEXT: v_mov_b32_e32 v4, s42 +; GFX908-NEXT: v_mov_b32_e32 v5, s43 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s44 +; GFX908-NEXT: v_mov_b32_e32 v4, s45 +; GFX908-NEXT: v_mov_b32_e32 v5, s46 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s47 +; GFX908-NEXT: v_mov_b32_e32 v4, s48 +; GFX908-NEXT: v_mov_b32_e32 v5, s49 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s50 +; GFX908-NEXT: v_mov_b32_e32 v4, s51 +; GFX908-NEXT: v_mov_b32_e32 v5, s16 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s17 +; GFX908-NEXT: v_mov_b32_e32 v4, s18 +; GFX908-NEXT: v_mov_b32_e32 v5, s19 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s20 +; GFX908-NEXT: v_mov_b32_e32 v4, s21 +; GFX908-NEXT: v_mov_b32_e32 v5, s22 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s23 +; GFX908-NEXT: v_mov_b32_e32 v4, s24 +; GFX908-NEXT: v_mov_b32_e32 v5, s25 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s26 +; GFX908-NEXT: v_mov_b32_e32 v4, s27 +; GFX908-NEXT: v_mov_b32_e32 v5, s28 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, s29 +; GFX908-NEXT: v_mov_b32_e32 v4, s30 +; GFX908-NEXT: v_mov_b32_e32 v5, s31 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v5 +; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a24 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a28 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a16 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a20 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a8 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a12 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a4 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16 +; GFX908-NEXT: s_cbranch_scc1 .LBB6_2 +; GFX908-NEXT: ; %bb.1: ; %bb2 +; GFX908-NEXT: s_add_u32 s8, s4, 48 +; GFX908-NEXT: s_mov_b32 s13, s9 +; GFX908-NEXT: s_addc_u32 s9, s5, 0 +; GFX908-NEXT: s_getpc_b64 s[4:5] +; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 +; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 +; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 +; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53] +; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55] +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX908-NEXT: .LBB6_2: ; %bb3 +; GFX908-NEXT: s_endpgm bb1: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3) @@ -94,10 +928,101 @@ bb3: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr: -; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] -; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}] define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -105,9 +1030,101 @@ bb: ret void } -; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr: -; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}] define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 { +; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112 +; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96 +; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80 +; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64 +; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48 +; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32 +; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v10 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v11 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v12 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v13 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v14 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v15 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v16 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v17 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v18 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v19 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v20 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v21 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v22 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v23 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v24 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v25 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v26 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v27 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v28 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v29 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v30 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v31 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v32 +; GFX908-NEXT: v_accvgpr_write_b32 a31, v33 +; GFX908-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX908-NEXT: v_mov_b32_e32 v3, 2.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a27 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a26 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a25 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a24 +; GFX908-NEXT: v_accvgpr_read_b32 v9, a31 +; GFX908-NEXT: v_accvgpr_read_b32 v8, a30 +; GFX908-NEXT: v_accvgpr_read_b32 v7, a29 +; GFX908-NEXT: v_accvgpr_read_b32 v6, a28 +; GFX908-NEXT: v_accvgpr_read_b32 v13, a19 +; GFX908-NEXT: v_accvgpr_read_b32 v12, a18 +; GFX908-NEXT: v_accvgpr_read_b32 v11, a17 +; GFX908-NEXT: v_accvgpr_read_b32 v10, a16 +; GFX908-NEXT: v_accvgpr_read_b32 v17, a23 +; GFX908-NEXT: v_accvgpr_read_b32 v16, a22 +; GFX908-NEXT: v_accvgpr_read_b32 v15, a21 +; GFX908-NEXT: v_accvgpr_read_b32 v14, a20 +; GFX908-NEXT: v_accvgpr_read_b32 v21, a11 +; GFX908-NEXT: v_accvgpr_read_b32 v20, a10 +; GFX908-NEXT: v_accvgpr_read_b32 v19, a9 +; GFX908-NEXT: v_accvgpr_read_b32 v18, a8 +; GFX908-NEXT: v_accvgpr_read_b32 v25, a15 +; GFX908-NEXT: v_accvgpr_read_b32 v24, a14 +; GFX908-NEXT: v_accvgpr_read_b32 v23, a13 +; GFX908-NEXT: v_accvgpr_read_b32 v22, a12 +; GFX908-NEXT: v_accvgpr_read_b32 v29, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v28, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v27, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v26, a0 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_accvgpr_read_b32 v5, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a6 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a4 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48 +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] bb: %in.1 = load <32 x float>, ptr addrspace(1) %arg %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0) @@ -121,3 +1138,6 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2 attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" } attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX90A: {{.*}} |