From 4f7c402d9ff1b2c908b97b78baf84157f08745e8 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Tue, 9 Jan 2024 17:31:42 +0530 Subject: [AMDGPU][NFC] Update left over tests for COV5 (#76984) Update AMDGPU CodeGen lit tests to check for COV5 ABI. --- llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 11 +++-- .../AMDGPU/call-alias-register-usage-agpr.ll | 9 ++-- .../CodeGen/AMDGPU/call-alias-register-usage0.ll | 5 ++- .../CodeGen/AMDGPU/call-alias-register-usage1.ll | 5 ++- .../CodeGen/AMDGPU/call-alias-register-usage2.ll | 5 ++- .../CodeGen/AMDGPU/call-alias-register-usage3.ll | 5 ++- .../test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll | 9 ++-- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 5 ++- .../CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll | 31 ++++++++----- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 18 +++++--- .../CodeGen/AMDGPU/promote-alloca-calling-conv.ll | 5 ++- llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll | 51 ++++++++++++---------- llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll | 9 ++-- 13 files changed, 106 insertions(+), 62 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index d838846..a374689 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=OPT %s -; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=NOOPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s ; Check that AMDGPUAttributor is not run with -O0. ; OPT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -18,7 +19,8 @@ ; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1 -; NOOPT: .amdhsa_user_sgpr_queue_ptr 1 +; COV4: .amdhsa_user_sgpr_queue_ptr 1 +; COV5: .amdhsa_user_sgpr_queue_ptr 0 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1 ; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0 @@ -32,3 +34,6 @@ define amdgpu_kernel void @foo() { ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 CODE_OBJECT_VERSION} diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll index 6ff2dba..72bb515 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -9,10 +9,10 @@ ; ALL-LABEL: {{^}}kernel: ; GFX908: .amdhsa_next_free_vgpr 32 -; GFX908-NEXT: .amdhsa_next_free_sgpr 36 +; GFX908-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A: .amdhsa_next_free_vgpr 65 -; GFX90A-NEXT: .amdhsa_next_free_sgpr 36 +; GFX90A: .amdhsa_next_free_vgpr 59 +; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 ; GFX90A-NEXT: .amdhsa_accum_offset 32 define amdgpu_kernel void @kernel() #0 { bb: @@ -29,3 +29,6 @@ bb: attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn } attributes #2 = { nounwind readnone willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll index 7973765..6afc906 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel0: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel0() #0 { bb: call void @alias0() #2 @@ -24,3 +24,6 @@ bb: attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn } attributes #2 = { nounwind readnone willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll index 79bb2fb..137bb13 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: {{^}}kernel1: ; CHECK: .amdhsa_next_free_vgpr 41 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel1() #0 { bb: call void asm sideeffect "; clobber v40 ", "~{v40}"() @@ -27,3 +27,6 @@ bb: attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="8,10" } attributes #2 = { nounwind readnone willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll index 5745dd9..2800ed6 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel2: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel2() #0 { bb: call void @alias2() #2 @@ -24,3 +24,6 @@ bb: attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="4,10" } attributes #2 = { nounwind readnone willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll index b922297..f7c0a57 100644 --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel3: ; CHECK: .amdhsa_next_free_vgpr 253 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel3() #0 { bb: call void @alias3() #2 @@ -24,3 +24,6 @@ bb: attributes #0 = { noinline norecurse nounwind optnone } attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" } attributes #2 = { nounwind readnone willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll index 084b968..ce478d4 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll @@ -28,7 +28,6 @@ define i32 @divergent_lshr_and_cmp(i32 %x) { entry: %0 = and i32 %x, 2 %1 = icmp ne i32 %0, 0 - ; Prevent removal of truncate in SDag by inserting llvm.amdgcn.if br i1 %1, label %out.true, label %out.else out.true: @@ -43,9 +42,9 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 % ; GCN-LABEL: name: uniform_opt_lshr_and_cmp ; GCN: bb.0.entry: ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: liveins: $sgpr2_sgpr3 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr2_sgpr3 ; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4) ; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4) ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]] @@ -84,7 +83,6 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 % entry: %0 = and i32 %x, 2 %1 = icmp ne i32 %0, 0 - ; Don't optimize the truncate in the SDag away. br i1 %1, label %out.true, label %out.else out.true: @@ -96,3 +94,6 @@ out.else: store i1 %1, ptr addrspace(1) %out ret void } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index e2e1eff..3be2d94 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) { ; R600: |PV.{{[XYZW]}}| ; R600: -PV -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 +; SI: s_load_dwordx2 s[0:1], s[2:3], 0x9 define amdgpu_kernel void @fneg_fabsf_fn_free_f32(ptr addrspace(1) %out, i32 %in) { %bc = bitcast i32 %in to float %fabs = call float @fabsf(float %bc) @@ -109,3 +109,6 @@ declare float @fabsf(float) readnone declare float @llvm.fabs.f32(float) readnone declare <2 x float> @llvm.fabs.v2f32(<2 x float>) readnone declare <4 x float> @llvm.fabs.v4f32(<4 x float>) readnone + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index ea5add0..3973cf1 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -36,6 +36,7 @@ ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 +; GCN-NEXT: .amdhsa_uses_dynamic_stack 0 ; GCN-NEXT: .amdhsa_enable_private_segment 0 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -64,6 +65,7 @@ define amdgpu_kernel void @minimal_kernel_inputs() { ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 +; GCN-NEXT: .amdhsa_uses_dynamic_stack 0 ; GCN-NEXT: .amdhsa_enable_private_segment 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -81,7 +83,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { } ; GCN-LABEL: {{^}}queue_ptr: -; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1] +; GCN: global_load_u8 v{{[0-9]+}}, ; WORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s15 ; NOWORKAROUND: v_mov_b32_e32 [[V:v[0-9]+]], s2 @@ -91,11 +93,12 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() { ; WORKAROUND: .amdhsa_user_sgpr_count 15 ; NOWORKAROUND: .amdhsa_user_sgpr_count 2 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 +; GCN-NEXT: .amdhsa_uses_dynamic_stack 0 ; GCN-NEXT: .amdhsa_enable_private_segment 0 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 0 @@ -117,16 +120,16 @@ define amdgpu_kernel void @queue_ptr() { ; WORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s14 ; WORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s15 -; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s8 -; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s9 -; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s10 +; NOWORKAROUND: v_mov_b32_e32 [[V_X:v[0-9]+]], s6 +; NOWORKAROUND: v_mov_b32_e32 [[V_Y:v[0-9]+]], s7 +; NOWORKAROUND: v_mov_b32_e32 [[V_Z:v[0-9]+]], s8 ; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[0:1] +; GCN: global_load_u8 v{{[0-9]+}}, ; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[2:3] -; GCN: global_load_u8 v{{[0-9]+}}, v{{[0-9]+}}, s[4:5] -; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s6 -; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s7 +; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_LO:[0-9]+]], s4 +; GCN-DAG: v_mov_b32_e32 v[[DISPATCH_HI:[0-9]+]], s5 ; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_X]], off ; GCN: global_store_b32 v{{\[[0-9]+:[0-9]+\]}}, [[V_Y]], off @@ -135,13 +138,14 @@ define amdgpu_kernel void @queue_ptr() { ; GCN: .amdhsa_kernel all_inputs ; WORKAROUND: .amdhsa_user_sgpr_count 13 -; NOWORKAROUND: .amdhsa_user_sgpr_count 8 +; NOWORKAROUND: .amdhsa_user_sgpr_count 6 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; GCN-NEXT: .amdhsa_wavefront_size32 +; GCN-NEXT: .amdhsa_uses_dynamic_stack 0 ; GCN-NEXT: .amdhsa_enable_private_segment 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_x 1 ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_id_y 1 @@ -149,7 +153,7 @@ define amdgpu_kernel void @queue_ptr() { ; GCN-NEXT: .amdhsa_system_sgpr_workgroup_info 0 ; GCN-NEXT: .amdhsa_system_vgpr_workitem_id 0 ; WORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 13 -; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 8 +; NOWORKAROUND: ; COMPUTE_PGM_RSRC2:USER_SGPR: 6 define amdgpu_kernel void @all_inputs() { %alloca = alloca i32, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca @@ -188,3 +192,6 @@ declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #0 declare i64 @llvm.amdgcn.dispatch.id() #0 attributes #0 = { nounwind readnone speculatable willreturn } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll index 7479fc8..2672c12 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll @@ -1,15 +1,16 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,CIH %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}is_local_vgpr: ; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[[0-9]+}}:[[PTR_HI:[0-9]+]]] -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10 +; CI-DAG: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: v_cmp_eq_u32_e32 vcc, s[[HI]], v[[PTR_HI]] -; CI: v_cmp_eq_u32_e32 vcc, [[APERTURE]], v[[PTR_HI]] +; CIT: v_cmp_eq_u32_e32 vcc, s4, v[[PTR_HI]] +; CIH: v_cmp_eq_u32_e32 vcc, s2, v[[PTR_HI]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -25,15 +26,15 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) { ; select and vcc branch. ; GCN-LABEL: {{^}}is_local_sgpr: -; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}} +; CI-DAG: s_load_dword s0, s[4:5], 0x1 -; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[6:7], 0x1{{$}} +; CI-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x33{{$}} ; GFX9-DAG: s_load_dword [[PTR_HI:s[0-9]+]], s[4:5], 0x4{{$}} ; GFX9: s_mov_b64 s[{{[0-9]+}}:[[HI:[0-9]+]]], src_shared_base ; GFX9: s_cmp_eq_u32 [[PTR_HI]], s[[HI]] -; CI: s_cmp_eq_u32 [[PTR_HI]], [[APERTURE]] +; CI: s_cmp_eq_u32 s0, [[PTR_HI]] ; GCN: s_cbranch_vccnz define amdgpu_kernel void @is_local_sgpr(ptr %ptr) { %val = call i1 @llvm.amdgcn.is.shared(ptr %ptr) @@ -51,3 +52,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 declare i1 @llvm.amdgcn.is.shared(ptr nocapture) #0 attributes #0 = { nounwind readnone speculatable } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll index ec83d7f..a8bb36b 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -77,7 +77,7 @@ declare i32 @foo(ptr addrspace(5)) #0 ; ASM: buffer_store_dword ; ASM: buffer_store_dword ; ASM: s_swappc_b64 -; ASM: ScratchSize: 16400 +; ASM: ScratchSize: 16 define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5) @@ -94,3 +94,6 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" } attributes #1 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll index ecdc384..7d7917e 100644 --- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll @@ -24,7 +24,7 @@ define amdgpu_kernel void @volatile_load_group_size_x(ptr addrspace(1) %out) #0 } ; CHECK-LABEL: @load_group_size_x( -; CHECK-NEXT: store i16 8, +; CHECK: store i16 %group.size.x, define amdgpu_kernel void @load_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -34,7 +34,7 @@ define amdgpu_kernel void @load_group_size_x(ptr addrspace(1) %out) #0 !reqd_wor } ; CHECK-LABEL: @load_group_size_y( -; CHECK-NEXT: store i16 16, +; CHECK: store i16 %group.size.y, define amdgpu_kernel void @load_group_size_y(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6 @@ -44,7 +44,7 @@ define amdgpu_kernel void @load_group_size_y(ptr addrspace(1) %out) #0 !reqd_wor } ; CHECK-LABEL: @load_group_size_z( -; CHECK-NEXT: store i16 2, +; CHECK: store i16 %group.size.z, define amdgpu_kernel void @load_group_size_z(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8 @@ -55,7 +55,7 @@ define amdgpu_kernel void @load_group_size_z(ptr addrspace(1) %out) #0 !reqd_wor ; Metadata uses i64 instead of i32 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64( -; CHECK-NEXT: store i16 8, +; CHECK: store i16 %group.size.x, define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(ptr addrspace(1) %out) #0 !reqd_work_group_size !2 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -66,7 +66,7 @@ define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(ptr addrsp ; Metadata uses i16 instead of i32 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16( -; CHECK-NEXT: store i16 8, +; CHECK: store i16 %group.size.x, define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(ptr addrspace(1) %out) #0 !reqd_work_group_size !3 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -76,7 +76,7 @@ define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(ptr addrsp } ; CHECK-LABEL: @use_local_size_x_8_16_2( -; CHECK-NEXT: store i64 8, +; CHECK: store i64 %zext, define amdgpu_kernel void @use_local_size_x_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -94,7 +94,7 @@ define amdgpu_kernel void @use_local_size_x_8_16_2(ptr addrspace(1) %out) #0 !re } ; CHECK-LABEL: @use_local_size_y_8_16_2( -; CHECK-NEXT: store i64 16, +; CHECK: store i64 %zext, define amdgpu_kernel void @use_local_size_y_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.y = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 6 @@ -112,7 +112,7 @@ define amdgpu_kernel void @use_local_size_y_8_16_2(ptr addrspace(1) %out) #0 !re } ; CHECK-LABEL: @use_local_size_z_8_16_2( -; CHECK-NEXT: store i64 2, +; CHECK: store i64 %zext, define amdgpu_kernel void @use_local_size_z_8_16_2(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.z = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 8 @@ -134,7 +134,7 @@ define amdgpu_kernel void @use_local_size_z_8_16_2(ptr addrspace(1) %out) #0 !re ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id( ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y() -; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3 +; CHECK: %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -154,7 +154,7 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(ptr addrspace(1) % ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size( ; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x() -; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3 +; CHECK: %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -174,9 +174,9 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(ptr addrspace(1) % ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type( ; CHECK: %grid.size.x = load i32, ptr addrspace(4) %gep.grid.size.x, align 4 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x() -; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3 +; CHECK: %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x -; CHECK: %smin = call i32 @llvm.smin.i32(i32 %sub, i32 8) +; CHECK: %smin = call i32 @llvm.smin.i32(i32 %sub, i32 %group.size.x.zext) define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -194,9 +194,9 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(ptr addrspace(1) % } ; CHECK-LABEL: @local_size_x_8_16_2_wrong_select( -; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3 +; CHECK: %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x -; CHECK: %umax = call i32 @llvm.umax.i32(i32 %sub, i32 8) +; CHECK: %umax = call i32 @llvm.umax.i32(i32 %sub, i32 %group.size.x.zext) ; CHECK: %zext = zext i32 %umax to i64 define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -218,7 +218,7 @@ define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(ptr addrspace(1) %ou ; CHECK: %grid.size.x = load i16, ptr addrspace(4) %gep.grid.size.x, align 4 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x() -; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3 +; CHECK: %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext ; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() @@ -238,7 +238,7 @@ define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(ptr addr } ; CHECK-LABEL: @func_group_size_x( -; CHECK-NEXT: ret i32 8 +; CHECK: ret i32 %zext define i32 @func_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -248,7 +248,7 @@ define i32 @func_group_size_x(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 } ; CHECK-LABEL: @__ockl_get_local_size_reqd_size( -; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ] +; CHECK: %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ] define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 { bb: %tmp = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #2 @@ -295,9 +295,9 @@ bb25: ; preds = %bb17, %bb9, %bb1, % } ; CHECK-LABEL: @all_local_size( -; CHECK-NEXT: store volatile i64 8, ptr addrspace(1) %out, align 4 -; CHECK-NEXT: store volatile i64 16, ptr addrspace(1) %out, align 4 -; CHECK-NEXT: store volatile i64 2, ptr addrspace(1) %out, align 4 +; CHECK: store volatile i64 %tmp34.i, ptr addrspace(1) %out, align 4 +; CHECK-NEXT: store volatile i64 %tmp34.i14, ptr addrspace(1) %out, align 4 +; CHECK-NEXT: store volatile i64 %tmp34.i7, ptr addrspace(1) %out, align 4 define amdgpu_kernel void @all_local_size(ptr addrspace(1) nocapture readnone %out) #0 !reqd_work_group_size !0 { %tmp.i = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0 %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0 @@ -376,8 +376,8 @@ define amdgpu_kernel void @load_group_size_xy_i32(ptr addrspace(1) %out) #0 !req } ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr( -; CHECK-NEXT: store volatile i16 8, ptr addrspace(1) %out, align 2 -; CHECK-NEXT: store volatile i16 16, ptr addrspace(1) %out, align 2 +; CHECK: store volatile i16 %group.size.x, ptr addrspace(1) %out, align 2 +; CHECK: store volatile i16 %group.size.y, ptr addrspace(1) %out, align 2 define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { %dispatch.ptr0 = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr0, i64 4 @@ -396,8 +396,8 @@ define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(ptr addrspa ; CHECK-NEXT: %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 ; CHECK-NEXT: %group.size.x = load i16, ptr addrspace(4) %gep.group.size.x, align 4 -; CHECK-NEXT: %zext = zext i16 %group.size.x to i64 -; CHECK-NEXT: store i64 %zext, ptr addrspace(1) %out, align 4 +; CHECK: %group.size.x.zext = zext i16 %group.size.x to i32 +; CHECK: store i64 %zext, ptr addrspace(1) %out define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(ptr addrspace(1) %out) #2 { %dispatch.ptr = tail call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() %gep.group.size.x = getelementptr inbounds i8, ptr addrspace(4) %dispatch.ptr, i64 4 @@ -456,3 +456,6 @@ attributes #3 = { nounwind "uniform-work-group-size"="false" } !1 = !{i32 8, i32 16} !2 = !{i64 8, i64 16, i64 2} !3 = !{i16 8, i16 16, i16 2} + +!llvm.module.flags = !{!4} +!4 = !{i32 1, !"amdgpu_code_object_version", i32 500} diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll index dcc90c0..e7c5aaf 100644 --- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll @@ -43,9 +43,9 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; GFX9-LABEL: test_simple_indirect_call: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4 -; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s17 +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s15 ; GFX9-NEXT: s_addc_u32 s1, s1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 @@ -76,3 +76,6 @@ define amdgpu_kernel void @test_simple_indirect_call() { ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" } ;. + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} -- cgit v1.1