diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/spill-agpr.ll')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/spill-agpr.ll | 482 |
1 files changed, 422 insertions, 60 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 6afef91..eb0d546 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -1,15 +1,107 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -check-prefixes=GCN,GFX90A %s -; GCN-LABEL: {{^}}max_12regs_13a_used: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] -; GCN: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { +; GFX908-LABEL: max_12regs_13a_used: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_cmp_lg_u32 s0, 0 +; GFX908-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: v_mov_b32_e32 v5, s8 +; GFX908-NEXT: v_mov_b32_e32 v1, s9 +; GFX908-NEXT: v_mov_b32_e32 v2, s10 +; GFX908-NEXT: v_accvgpr_write_b32 a0, v5 +; GFX908-NEXT: v_mov_b32_e32 v5, s11 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v5 +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3] +; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3] +; GFX908-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX908-NEXT: ; %bb.1: ; %st +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_endpgm +; GFX908-NEXT: .LBB0_2: ; %use +; GFX908-NEXT: s_nop 2 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a4 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a7 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a5 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a6 +; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX908-NEXT: v_accvgpr_write_b32 a8, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a9, 1 +; GFX908-NEXT: v_accvgpr_write_b32 a10, 2 +; GFX908-NEXT: v_accvgpr_write_b32 a11, 3 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_write_b32 a7, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GFX908-NEXT: v_mov_b32_e32 v2, v0 +; GFX908-NEXT: v_mov_b32_e32 v3, v0 +; GFX908-NEXT: v_mov_b32_e32 v4, 0 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: max_12regs_13a_used: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c +; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_cmp_lg_u32 s0, 0 +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: v_accvgpr_write_b32 a0, s8 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, s9 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, s10 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, s11 +; GFX90A-NEXT: s_nop 1 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3] +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[4:7], v0, v0, a[0:3] +; GFX90A-NEXT: s_cbranch_scc0 .LBB0_2 +; GFX90A-NEXT: ; %bb.1: ; %st +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_endpgm +; GFX90A-NEXT: .LBB0_2: ; %use +; GFX90A-NEXT: s_nop 3 +; GFX90A-NEXT: v_accvgpr_read_b32 v9, a7 +; GFX90A-NEXT: v_accvgpr_read_b32 v8, a6 +; GFX90A-NEXT: v_accvgpr_read_b32 v7, a5 +; GFX90A-NEXT: v_accvgpr_read_b32 v6, a4 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, 5 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, 1 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, 2 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, 3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v6 +; GFX90A-NEXT: v_mov_b32_e32 v4, 0 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v7 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v8 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v9 +; GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GFX90A-NEXT: v_mov_b32_e32 v2, v0 +; GFX90A-NEXT: v_mov_b32_e32 v3, v0 +; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_endpgm bb: %in.1 = load <4 x float>, ptr addrspace(1) %arg %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0) @@ -28,16 +120,64 @@ st: call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2) ret void } - -; GCN-LABEL: {{^}}max_10_vgprs_used_9a: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}} -; GCN-NOT: buffer_store_dword -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; GCN: ScratchSize: 0 + define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { +; GFX908-LABEL: max_10_vgprs_used_9a: +; GFX908: ; %bb.0: +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v4, a1 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v1 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: max_10_vgprs_used_9a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX90A-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX90A-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_read_b32 v5, a3 +; GFX90A-NEXT: v_accvgpr_read_b32 v4, a2 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, v1 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a0, v4 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v5 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_endpgm %a1 = call <4 x i32> asm sideeffect "", "=a"() %a2 = call <4 x i32> asm sideeffect "", "=a"() %a3 = call i32 asm sideeffect "", "=a"() @@ -46,17 +186,168 @@ define amdgpu_kernel void @max_10_vgprs_used_9a() #1 { call void asm sideeffect "", "a"(<2 x i32> %a4) ret void } - -; GCN-LABEL: {{^}}max_32regs_mfma32: -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 -; GCN-NOT: buffer_store_dword -; GCN: v_accvgpr_read_b32 -; GCN: v_mfma_f32_32x32x1f32 -; GCN-NOT: buffer_load_dword -; GCN: v_accvgpr_write_b32 ; GCN: ScratchSize: 0 + define amdgpu_kernel void @max_32regs_mfma32(ptr addrspace(1) %arg) #3 { +; GFX908-LABEL: max_32regs_mfma32: +; GFX908: ; %bb.0: ; %bb +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x40c00000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x40e00000 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; GFX908-NEXT: v_accvgpr_write_b32 a5, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a6, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41100000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41200000 +; GFX908-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a8, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a9, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41300000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41400000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41500000 +; GFX908-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a11, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a12, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41700000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41800000 +; GFX908-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a14, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a15, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41880000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41900000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41980000 +; GFX908-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a17, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a18, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41a00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41a80000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41b00000 +; GFX908-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a20, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a21, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41b80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41c00000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41c80000 +; GFX908-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a23, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a24, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41d00000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41d80000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41e00000 +; GFX908-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX908-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a26, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a27, v4 +; GFX908-NEXT: v_mov_b32_e32 v2, 0x41e80000 +; GFX908-NEXT: v_mov_b32_e32 v3, 0x41f00000 +; GFX908-NEXT: v_mov_b32_e32 v4, 0x41f80000 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 2.0 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 4.0 +; GFX908-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a29, v3 +; GFX908-NEXT: v_accvgpr_write_b32 a30, v4 +; GFX908-NEXT: v_accvgpr_write_b32 a31, 2.0 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_nop 7 +; GFX908-NEXT: s_nop 5 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v5 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a0 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: global_store_dword v0, v1, s[2:3] +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: max_32regs_mfma32: +; GFX90A: ; %bb.0: ; %bb +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40400000 +; GFX90A-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40a00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a4, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40c00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a5, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x40e00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a6, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41000000 +; GFX90A-NEXT: v_accvgpr_write_b32 a7, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41100000 +; GFX90A-NEXT: v_accvgpr_write_b32 a8, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41200000 +; GFX90A-NEXT: v_accvgpr_write_b32 a9, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41300000 +; GFX90A-NEXT: v_accvgpr_write_b32 a10, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41400000 +; GFX90A-NEXT: v_accvgpr_write_b32 a11, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41500000 +; GFX90A-NEXT: v_accvgpr_write_b32 a12, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41600000 +; GFX90A-NEXT: v_accvgpr_write_b32 a13, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GFX90A-NEXT: v_accvgpr_write_b32 a14, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41800000 +; GFX90A-NEXT: v_accvgpr_write_b32 a15, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41880000 +; GFX90A-NEXT: v_accvgpr_write_b32 a16, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41900000 +; GFX90A-NEXT: v_accvgpr_write_b32 a17, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41980000 +; GFX90A-NEXT: v_accvgpr_write_b32 a18, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a19, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41a80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a20, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a21, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41b80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a22, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a23, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41c80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a24, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a25, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41d80000 +; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a26, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e00000 +; GFX90A-NEXT: v_accvgpr_write_b32 a27, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41e80000 +; GFX90A-NEXT: v_accvgpr_write_b32 a28, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f00000 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_accvgpr_write_b32 a1, 2.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a29, v2 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0x41f80000 +; GFX90A-NEXT: v_accvgpr_read_b32 v3, a0 +; GFX90A-NEXT: v_accvgpr_write_b32 a0, 1.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a3, 4.0 +; GFX90A-NEXT: v_accvgpr_write_b32 a30, v2 +; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a1 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v1, a[0:31] +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 7 +; GFX90A-NEXT: s_nop 2 +; GFX90A-NEXT: v_accvgpr_write_b32 a1, v3 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, a0, s[2:3] +; GFX90A-NEXT: s_endpgm bb: %v = call i32 asm sideeffect "", "=a"() br label %use @@ -68,42 +359,110 @@ use: store float %elt1, ptr addrspace(1) %arg ret void } +; GCN: ScratchSize: 0 ; Should spill agprs to memory for both gfx908 and gfx90a. -; GCN-LABEL: {{^}}max_6regs_used_8a: -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 -; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 - -; GFX908-DAG: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v5, a1 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v5, a2 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX908-DAG: v_accvgpr_read_b32 v5, a3 ; Reload Reuse -; GFX908-DAG: buffer_store_dword v5, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill - -; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill -; GFX90A-DAG: buffer_store_dword a3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill - -; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3] - -; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload -; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload -; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off - -; GFX90A-DAG: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload -; GFX90A-DAG: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload -; GFX90A: global_store_dwordx4 v[0:1], v[2:5], off - -; GCN: ScratchSize: 20 define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { +; GFX908-LABEL: max_6regs_used_8a: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX908-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX908-NEXT: s_mov_b32 s6, -1 +; GFX908-NEXT: s_mov_b32 s7, 0xe00000 +; GFX908-NEXT: s_add_u32 s4, s4, s3 +; GFX908-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def v1 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; def a[0:3] +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_addc_u32 s5, s5, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a4, v1 +; GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GFX908-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; GFX908-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v5, a1 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v5, a2 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill +; GFX908-NEXT: v_accvgpr_read_b32 v5, a3 ; Reload Reuse +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: buffer_store_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill +; GFX908-NEXT: s_waitcnt vmcnt(4) +; GFX908-NEXT: v_accvgpr_write_b32 a0, v0 +; GFX908-NEXT: v_accvgpr_write_b32 a1, v1 +; GFX908-NEXT: v_accvgpr_write_b32 a2, v2 +; GFX908-NEXT: v_accvgpr_write_b32 a3, v3 +; GFX908-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v0, v0, a[0:3] +; GFX908-NEXT: s_nop 3 +; GFX908-NEXT: v_accvgpr_read_b32 v0, a0 +; GFX908-NEXT: v_accvgpr_read_b32 v1, a1 +; GFX908-NEXT: v_accvgpr_read_b32 v2, a2 +; GFX908-NEXT: v_accvgpr_read_b32 v3, a3 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX908-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; 4-byte Folded Reload +; GFX908-NEXT: s_nop 0 +; GFX908-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_read_b32 v0, a4 +; GFX908-NEXT: ;;#ASMSTART +; GFX908-NEXT: ; use v0 +; GFX908-NEXT: ;;#ASMEND +; GFX908-NEXT: s_endpgm +; +; GFX90A-LABEL: max_6regs_used_8a: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX90A-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX90A-NEXT: s_mov_b32 s6, -1 +; GFX90A-NEXT: s_mov_b32 s7, 0xe00000 +; GFX90A-NEXT: s_add_u32 s4, s4, s3 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX90A-NEXT: s_addc_u32 s5, s5, 0 +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; def a[0:3] +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: buffer_store_dword a0, off, s[4:7], 0 ; 4-byte Folded Spill +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: buffer_store_dword a1, off, s[4:7], 0 offset:4 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a2, off, s[4:7], 0 offset:8 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword a3, off, s[4:7], 0 offset:12 ; 4-byte Folded Spill +; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_load_dwordx4 a[0:3], v0, s[2:3] +; GFX90A-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: v_mfma_f32_4x4x1f32 a[0:3], v2, v2, a[0:3] +; GFX90A-NEXT: s_nop 4 +; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[2:3] +; GFX90A-NEXT: buffer_load_dword v2, off, s[4:7], 0 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:4 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v4, off, s[4:7], 0 offset:8 ; 4-byte Folded Reload +; GFX90A-NEXT: buffer_load_dword v5, off, s[4:7], 0 offset:12 ; 4-byte Folded Reload +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: ;;#ASMSTART +; GFX90A-NEXT: ; use v1 +; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %v0 = call float asm sideeffect "; def $0", "=v"() %a4 = call <4 x float> asm sideeffect "; def $0", "=a"() @@ -115,6 +474,7 @@ define amdgpu_kernel void @max_6regs_used_8a(ptr addrspace(1) %arg) #4 { call void asm sideeffect "; use $0", "v"(float %v0); ret void } +; GCN: ScratchSize: 20 declare i32 @llvm.amdgcn.workitem.id.x() declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32) @@ -125,3 +485,5 @@ attributes #1 = { nounwind "amdgpu-num-vgpr"="10" "amdgpu-no-dispatch-id" "amdgp attributes #2 = { nounwind "amdgpu-num-vgpr"="12" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #3 = { nounwind "amdgpu-num-vgpr"="32" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } attributes #4 = { nounwind "amdgpu-num-vgpr"="6" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} |