about | summary | refs | log | tree | commit | diff
path: root/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/spill-agpr.ll')
-rw-r--r-- llvm/test/CodeGen/AMDGPU/spill-agpr.ll 482
1 files changed, 422 insertions, 60 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
index 6afef91..eb0d546 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll
@@ -1,15 +1,107 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s
-; GCN-LABEL: {{^}}max_12regs_13a_used:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
-; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 {
+; GFX908-LABEL: max_12regs_13a_used:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, s8
+; GFX908-NEXT: v_mov_b32_e32 v1, s9
+; GFX908-NEXT: v_mov_b32_e32 v2, s10
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
+; GFX908-NEXT: v_mov_b32_e32 v5, s11
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX908-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX908-NEXT: ; %bb.1: ; %st
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+; GFX908-NEXT: .LBB0_2: ; %use
+; GFX908-NEXT: s_nop 2
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a6
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_mov_b32_e32 v4, 0
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_12regs_13a_used:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3]
+; GFX90A-NEXT: s_cbranch_scc0 .LBB0_2
+; GFX90A-NEXT: ; %bb.1: ; %st
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
+; GFX90A-NEXT: .LBB0_2: ; %use
+; GFX90A-NEXT: s_nop 3
+; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7
+; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6
+; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5
+; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
bb:
%in.1 = load <4 x float>, ptr addrspace(1) %arg
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
@@ -28,16 +120,64 @@ st:
call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
ret void
}
-
-; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
-; GCN-NOT: buffer_store_dword
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
+; GFX908-LABEL: max_10_vgprs_used_9a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_10_vgprs_used_9a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%a1 = call <4 x i32> asm sideeffect "", "=a"()
%a2 = call <4 x i32> asm sideeffect "", "=a"()
%a3 = call i32 asm sideeffect "", "=a"()
@@ -46,17 +186,168 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
call void asm sideeffect "", "a"(<2 x i32> %a4)
ret void
}
-
-; GCN-LABEL: {{^}}max_32regs_mfma32:
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-; GCN-NOT: buffer_store_dword
-; GCN: v_accvgpr_read_b32
-; GCN: v_mfma_f32_32x32x1f32
-; GCN-NOT: buffer_load_dword
-; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
+
define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 {
+; GFX908-LABEL: max_32regs_mfma32:
+; GFX908: ; %bb.0: ; %bb
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000
+; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v4
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX908-NEXT: v_mov_b32_e32 v0, 0
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v5
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_32regs_mfma32:
+; GFX90A: ; %bb.0: ; %bb
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41000000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41100000
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41200000
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41300000
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41400000
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41500000
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41600000
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41700000
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41800000
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41880000
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41900000
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41980000
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d80000
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e00000
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f00000
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000
+; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31]
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_store_dword v0, a0, s[2:3]
+; GFX90A-NEXT: s_endpgm
bb:
%v = call i32 asm sideeffect "", "=a"()
br label %use
@@ -68,42 +359,110 @@ use:
store float %elt1, ptr addrspace(1) %arg
ret void
}
+; GCN: ScratchSize: 0
; Should spill agprs to memory for both gfx908 and gfx90a.
-; GCN-LABEL: {{^}}max_6regs_used_8a:
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
-
-; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
-; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
-; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
-
-; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]
-
-; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
-; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
-; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off
-
-; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
-; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
-; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off
-
-; GCN: ScratchSize: 20
define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
+; GFX908-LABEL: max_6regs_used_8a:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX908-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX908-NEXT: s_mov_b32 s6, -1
+; GFX908-NEXT: s_mov_b32 s7, 0xe00000
+; GFX908-NEXT: s_add_u32 s4, s4, s3
+; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def v1
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; def a[0:3]
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_addc_u32 s5, s5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a1 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 ; Reload Reuse
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX908-NEXT: s_waitcnt vmcnt(4)
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3]
+; GFX908-NEXT: s_nop 3
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX908-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX908-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a4
+; GFX908-NEXT: ;;#ASMSTART
+; GFX908-NEXT: ; use v0
+; GFX908-NEXT: ;;#ASMEND
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: max_6regs_used_8a:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
+; GFX90A-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
+; GFX90A-NEXT: s_mov_b32 s6, -1
+; GFX90A-NEXT: s_mov_b32 s7, 0xe00000
+; GFX90A-NEXT: s_add_u32 s4, s4, s3
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_addc_u32 s5, s5, 0
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; def a[0:3]
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: buffer_store_dword a0, off, s[4:7], 0 ; 4-byte Folded Spill
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_store_dword a1, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a2, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill
+; GFX90A-NEXT: buffer_store_dword a3, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3]
+; GFX90A-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3]
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3]
+; GFX90A-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload
+; GFX90A-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: ;;#ASMSTART
+; GFX90A-NEXT: ; use v1
+; GFX90A-NEXT: ;;#ASMEND
+; GFX90A-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%v0 = call float asm sideeffect "; def $0", "=v"()
%a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
@@ -115,6 +474,7 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 {
call void asm sideeffect "; use $0", "v"(float %v0);
ret void
}
+; GCN: ScratchSize: 20
declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -125,3 +485,5 @@ attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgp
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}