aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll | 1074
1 files changed, 1047 insertions, 27 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
index 6763957..f7aaa3e 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-cd-select.ll
@@ -1,15 +1,148 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --enable-var-scope --check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX90A %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vgpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_vgpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_vgpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -17,9 +150,142 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_agpr(ptr addrspace(1) %arg) #2 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x40
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s16
+; GFX908-NEXT: v_mov_b32_e32 v1, s17
+; GFX908-NEXT: v_mov_b32_e32 v2, s18
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s21
+; GFX908-NEXT: v_mov_b32_e32 v1, s22
+; GFX908-NEXT: v_mov_b32_e32 v2, s23
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s24
+; GFX908-NEXT: v_mov_b32_e32 v1, s25
+; GFX908-NEXT: v_mov_b32_e32 v2, s26
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s27
+; GFX908-NEXT: v_mov_b32_e32 v1, s28
+; GFX908-NEXT: v_mov_b32_e32 v2, s29
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s30
+; GFX908-NEXT: v_mov_b32_e32 v1, s31
+; GFX908-NEXT: v_mov_b32_e32 v2, s0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_mov_b32_e32 v1, s2
+; GFX908-NEXT: v_mov_b32_e32 v2, s3
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s7
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: v_mov_b32_e32 v3, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s10
+; GFX908-NEXT: v_mov_b32_e32 v1, s11
+; GFX908-NEXT: v_mov_b32_e32 v2, s12
+; GFX908-NEXT: v_mov_b32_e32 v5, s20
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v1, s14
+; GFX908-NEXT: v_mov_b32_e32 v2, s15
+; GFX908-NEXT: v_mov_b32_e32 v3, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -27,9 +293,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_virtual_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={a0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -38,9 +400,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_phys_agpr(ptr addrspace(1) %arg) {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_phys_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use a[100:131]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void asm sideeffect "; use $0", "{a[100:131]}"(<32 x float> poison)
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -49,10 +507,105 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_inline_asm_no_agprs(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_inline_asm_no_agprs:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1]
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; GFX908-NEXT: s_endpgm
bb:
%acc = call i32 asm sideeffect "; def $0", "={v0}"()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -61,9 +614,127 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call(ptr addrspace(1) %arg) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s38, -1
+; GFX908-NEXT: s_mov_b32 s39, 0xe00000
+; GFX908-NEXT: s_add_u32 s36, s36, s11
+; GFX908-NEXT: s_addc_u32 s37, s37, 0
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_add_u32 s8, s4, 44
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: v_mov_b32_e32 v40, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: global_load_dwordx4 v[28:31], v40, s[34:35] offset:112
+; GFX908-NEXT: global_load_dwordx4 v[24:27], v40, s[34:35] offset:96
+; GFX908-NEXT: global_load_dwordx4 v[20:23], v40, s[34:35] offset:80
+; GFX908-NEXT: global_load_dwordx4 v[16:19], v40, s[34:35] offset:64
+; GFX908-NEXT: global_load_dwordx4 v[12:15], v40, s[34:35] offset:48
+; GFX908-NEXT: global_load_dwordx4 v[8:11], v40, s[34:35] offset:32
+; GFX908-NEXT: global_load_dwordx4 v[4:7], v40, s[34:35] offset:16
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v40, s[34:35]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v31
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a0
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: global_store_dwordx4 v40, v[4:7], s[34:35] offset:112
+; GFX908-NEXT: global_store_dwordx4 v40, v[8:11], s[34:35] offset:64
+; GFX908-NEXT: global_store_dwordx4 v40, v[12:15], s[34:35] offset:80
+; GFX908-NEXT: global_store_dwordx4 v40, v[16:19], s[34:35] offset:32
+; GFX908-NEXT: global_store_dwordx4 v40, v[20:23], s[34:35] offset:48
+; GFX908-NEXT: global_store_dwordx4 v40, v[24:27], s[34:35]
+; GFX908-NEXT: global_store_dwordx4 v40, v[0:3], s[34:35] offset:16
+; GFX908-NEXT: s_endpgm
bb:
call void @foo()
%in.1 = load <32 x float>, ptr addrspace(1) %arg
@@ -75,10 +746,173 @@ bb:
; We could avoid the scan to find calls, since we see them during lowering before selection.
; However, in SDag, lowering and selection are done block by block, so this would only work
; in Global ISel.
-
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_call_multi_bb:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define amdgpu_kernel void @test_mfma_f32_32x32x1f32_call_multi_bb(ptr addrspace(1) %arg, i1 %c0) #1 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_call_multi_bb:
+; GFX908: ; %bb.0: ; %bb1
+; GFX908-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s54, -1
+; GFX908-NEXT: s_mov_b32 s55, 0xe00000
+; GFX908-NEXT: s_add_u32 s52, s52, s11
+; GFX908-NEXT: s_mov_b32 s14, s10
+; GFX908-NEXT: s_mov_b32 s12, s8
+; GFX908-NEXT: s_mov_b64 s[10:11], s[6:7]
+; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX908-NEXT: s_load_dword s8, s[4:5], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v6, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v7, 0
+; GFX908-NEXT: s_addc_u32 s53, s53, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
+; GFX908-NEXT: s_load_dwordx16 s[16:31], s[6:7], 0x40
+; GFX908-NEXT: s_bitcmp0_b32 s8, 0
+; GFX908-NEXT: s_mov_b32 s32, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, s36
+; GFX908-NEXT: v_mov_b32_e32 v4, s37
+; GFX908-NEXT: v_mov_b32_e32 v5, s40
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s38
+; GFX908-NEXT: v_mov_b32_e32 v4, s39
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v4
+; GFX908-NEXT: v_mov_b32_e32 v3, s41
+; GFX908-NEXT: v_mov_b32_e32 v4, s42
+; GFX908-NEXT: v_mov_b32_e32 v5, s43
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s44
+; GFX908-NEXT: v_mov_b32_e32 v4, s45
+; GFX908-NEXT: v_mov_b32_e32 v5, s46
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s47
+; GFX908-NEXT: v_mov_b32_e32 v4, s48
+; GFX908-NEXT: v_mov_b32_e32 v5, s49
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s50
+; GFX908-NEXT: v_mov_b32_e32 v4, s51
+; GFX908-NEXT: v_mov_b32_e32 v5, s16
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s17
+; GFX908-NEXT: v_mov_b32_e32 v4, s18
+; GFX908-NEXT: v_mov_b32_e32 v5, s19
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s20
+; GFX908-NEXT: v_mov_b32_e32 v4, s21
+; GFX908-NEXT: v_mov_b32_e32 v5, s22
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s23
+; GFX908-NEXT: v_mov_b32_e32 v4, s24
+; GFX908-NEXT: v_mov_b32_e32 v5, s25
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s26
+; GFX908-NEXT: v_mov_b32_e32 v4, s27
+; GFX908-NEXT: v_mov_b32_e32 v5, s28
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, s29
+; GFX908-NEXT: v_mov_b32_e32 v4, s30
+; GFX908-NEXT: v_mov_b32_e32 v5, s31
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v5
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v6, v3, a[0:31] cbsz:1 abid:2 blgp:3
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a24
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a28
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:112
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a16
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:64
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a20
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:80
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a8
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:32
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a12
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:48
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a4
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v7, v[3:6], s[6:7] offset:16
+; GFX908-NEXT: s_cbranch_scc1 .LBB6_2
+; GFX908-NEXT: ; %bb.1: ; %bb2
+; GFX908-NEXT: s_add_u32 s8, s4, 48
+; GFX908-NEXT: s_mov_b32 s13, s9
+; GFX908-NEXT: s_addc_u32 s9, s5, 0
+; GFX908-NEXT: s_getpc_b64 s[4:5]
+; GFX908-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
+; GFX908-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12
+; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX908-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX908-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GFX908-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX908-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX908-NEXT: .LBB6_2: ; %bb3
+; GFX908-NEXT: s_endpgm
bb1:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 1, i32 2, i32 3)
@@ -94,10 +928,101 @@ bb3:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_noagpr:
-; GFX908: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
-; GFX90A: v_mfma_f32_32x32x1{{.*}} v[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, v[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_noagpr(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_noagpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -105,9 +1030,101 @@ bb:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_nonentry_with_agpr:
-; GCN: v_mfma_f32_32x32x1{{.*}} a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9:]+}}, a[{{[0-9:]+}}]
define void @test_mfma_f32_32x32x1f32_nonentry_with_agpr(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: test_mfma_f32_32x32x1f32_nonentry_with_agpr:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[30:33], v[0:1], off offset:112
+; GFX908-NEXT: global_load_dwordx4 v[26:29], v[0:1], off offset:96
+; GFX908-NEXT: global_load_dwordx4 v[22:25], v[0:1], off offset:80
+; GFX908-NEXT: global_load_dwordx4 v[18:21], v[0:1], off offset:64
+; GFX908-NEXT: global_load_dwordx4 v[14:17], v[0:1], off offset:48
+; GFX908-NEXT: global_load_dwordx4 v[10:13], v[0:1], off offset:32
+; GFX908-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
+; GFX908-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v10
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v11
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v12
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v13
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v14
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v15
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v16
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v18
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v19
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v20
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v21
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v22
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v23
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v24
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v25
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v26
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v27
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v28
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v29
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v30
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v31
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v32
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v33
+; GFX908-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v3, 2.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a27
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a31
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a0
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a4
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:112
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[10:13], off offset:64
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[14:17], off offset:80
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[18:21], off offset:32
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[22:25], off offset:48
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[26:29], off
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
bb:
%in.1 = load <32 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %in.1, i32 0, i32 0, i32 0)
@@ -121,3 +1138,6 @@ attributes #0 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
attributes #2 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-agpr-alloc"="0" }
attributes #3 = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; GFX90A: {{.*}}