Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
 llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll       |  14
 llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll                                 |   2
 llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll                       | 519
 llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll                   |   2
 llvm/test/CodeGen/AMDGPU/callbr.ll                                          |  54
 llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll |  51
 llvm/test/CodeGen/AMDGPU/infinite-loop.ll                                   | 257
 llvm/test/CodeGen/AMDGPU/private-function.ll                                |  16
 llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir                           |  22
 llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll                | 100
 llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll             | 161
 llvm/test/CodeGen/AMDGPU/update-phi.ll                                      |  39
 12 files changed, 1192 insertions(+), 45 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index e86f747..37b5422 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
 
 ; Note: we use MIR test checks + stop after legalizer to prevent
 ; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9..61a6137 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
 
 declare void @readsMem(ptr) #0
 declare void @writesMem(ptr) #1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
new file mode 100644
index 0000000..06150e42
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -0,0 +1,519 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s
+
+; These situations are "special" in that they either have an alloca that is not
+; in the entry block or that they have a dynamic alloca. Both situations affect
+; prolog/epilog generation.
+
+declare amdgpu_gfx void @foo()
+
+define amdgpu_cs_chain void @test_alloca() {
+; GFX12-LABEL: test_alloca:
+; GFX12:       ; %bb.0: ; %.entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b32 s0, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s0, 0x200
+; GFX12-NEXT:    scratch_store_b32 off, v0, s0
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca:
+; GFX942:       ; %bb.0: ; %.entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    s_mov_b32 s0, s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_add_i32 s32, s0, 0x400
+; GFX942-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NEXT:    s_endpgm
+.entry:
+  br label %SW_C
+
+SW_C:                                             ; preds = %.entry
+  %v = alloca i32, i32 1, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_alloca_var_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 15
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_b32 s0, s0, -16
+; GFX12-NEXT:    s_mov_b32 s1, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX12-NEXT:    scratch_store_b32 off, v0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s1, s0
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca_var_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX942-NEXT:    s_add_i32 s0, s0, 15
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    s_and_b32 s0, s0, -16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_lshl_b32 s0, s0, 6
+; GFX942-NEXT:    s_mov_b32 s1, s32
+; GFX942-NEXT:    s_add_i32 s32, s1, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, s1
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
+; GFX12-LABEL: test_alloca_var:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX12-NEXT:    s_bitset0_b32 s1, s2
+; GFX12-NEXT:    s_max_u32 s0, s0, s3
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_mov_b32 s1, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s1
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca_var:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT:    s_max_u32 s2, s2, s4
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b32 s0, s32
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1
+; GFX942-NEXT:    scratch_store_dword off, v0, s0
+; GFX942-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call() {
+; GFX12-LABEL: test_alloca_and_call:
+; GFX12:       ; %bb.0: ; %.entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_mov_b32 s2, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s2, 0x200
+; GFX12-NEXT:    scratch_store_b32 off, v0, s2
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call:
+; GFX942:       ; %bb.0: ; %.entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    s_mov_b32 s2, s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_add_i32 s32, s2, 0x400
+; GFX942-NEXT:    scratch_store_dword off, v0, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    s_endpgm
+.entry:
+  br label %SW_C
+
+SW_C:                                             ; preds = %.entry
+  %v = alloca i32, i32 1, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  call amdgpu_gfx void @foo()
+  ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_alloca_and_call_var_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 15
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_b32 s0, s0, -16
+; GFX12-NEXT:    s_mov_b32 s1, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX12-NEXT:    scratch_store_b32 off, v0, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s1, s0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call_var_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX942-NEXT:    s_add_i32 s0, s0, 15
+; GFX942-NEXT:    s_and_b32 s0, s0, -16
+; GFX942-NEXT:    s_lshl_b32 s2, s0, 6
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b32 s3, s32
+; GFX942-NEXT:    s_add_i32 s32, s3, s2
+; GFX942-NEXT:    scratch_store_dword off, v0, s3
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  call amdgpu_gfx void @foo()
+  ret void
+}
+
+define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
+; GFX12-LABEL: test_alloca_and_call_var:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s3, v1, s2
+; GFX12-NEXT:    s_bitset0_b32 s1, s2
+; GFX12-NEXT:    s_max_u32 s0, s0, s3
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    s_mov_b32 s1, s32
+; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v1, s0, 5, s1
+; GFX12-NEXT:    scratch_store_b32 off, v0, s1
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_alloca_and_call_var:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v1, s3
+; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT:    s_max_u32 s2, s2, s4
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s3, s32
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1
+; GFX942-NEXT:    scratch_store_dword off, v0, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s32, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  store i32 0, ptr addrspace(5) %v, align 4
+  call amdgpu_gfx void @foo()
+  ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca() {
+; GFX12-LABEL: test_call_and_alloca:
+; GFX12:       ; %bb.0: ; %.entry
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_getpc_b64 s[0:1]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s1, s1
+; GFX12-NEXT:    s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_mov_b32 s4, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s4, 0x200
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    scratch_store_b32 off, v0, s4
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca:
+; GFX942:       ; %bb.0: ; %.entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    s_mov_b32 s4, s32
+; GFX942-NEXT:    s_add_i32 s32, s4, 0x400
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    scratch_store_dword off, v0, s4
+; GFX942-NEXT:    s_endpgm
+.entry:
+  br label %SW_C
+
+SW_C:                                             ; preds = %.entry
+  %v = alloca i32, i32 1, align 4, addrspace(5)
+  call amdgpu_gfx void @foo()
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) {
+; GFX12-LABEL: test_call_and_alloca_var_uniform:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT:    s_add_co_i32 s0, s0, 15
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_and_b32 s0, s0, -16
+; GFX12-NEXT:    s_mov_b32 s4, s32
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshl_b32 s0, s0, 5
+; GFX12-NEXT:    v_mov_b32_e32 v40, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s32, s4, s0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT:    scratch_store_b32 off, v40, s4
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca_var_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX942-NEXT:    s_add_i32 s0, s0, 15
+; GFX942-NEXT:    s_and_b32 s0, s0, -16
+; GFX942-NEXT:    s_lshl_b32 s2, s0, 6
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:    s_mov_b32 s4, s32
+; GFX942-NEXT:    v_mov_b32_e32 v40, 0
+; GFX942-NEXT:    s_add_i32 s32, s4, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    scratch_store_dword off, v40, s4
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  call amdgpu_gfx void @foo()
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) {
+; GFX12-LABEL: test_call_and_alloca_var:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX12-NEXT:    v_mov_b32_e32 v40, 0
+; GFX12-NEXT:    s_mov_b32 s1, exec_lo
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    s_mov_b32 s32, 16
+; GFX12-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readlane_b32 s3, v0, s2
+; GFX12-NEXT:    s_bitset0_b32 s1, s2
+; GFX12-NEXT:    s_max_u32 s0, s0, s3
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX12-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX12-NEXT:  ; %bb.2:
+; GFX12-NEXT:    s_getpc_b64 s[2:3]
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_sext_i32_i16 s3, s3
+; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24
+; GFX12-NEXT:    s_mov_b32 s4, s32
+; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT:    v_lshl_add_u32 v0, s0, 5, s4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_alu 0xf1ff
+; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX12-NEXT:    scratch_store_b32 off, v40, s4
+; GFX12-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_call_and_alloca_var:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15
+; GFX942-NEXT:    v_and_b32_e32 v0, -16, v0
+; GFX942-NEXT:    v_mov_b32_e32 v40, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s32, 16
+; GFX942-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s3
+; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX942-NEXT:    s_max_u32 s2, s2, s4
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b32 s4, s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    v_lshl_add_u32 v0, s2, 6, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s32, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    scratch_store_dword off, v40, s4
+; GFX942-NEXT:    s_endpgm
+  %v = alloca i32, i32 %count, align 4, addrspace(5)
+  call amdgpu_gfx void @foo()
+  store i32 0, ptr addrspace(5) %v, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index f6ae516..89d0394 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -1489,7 +1489,7 @@ attributes #2 = { noinline }
 !0 = !{float 3.0}
 ;.
 ; CHECK: attributes #[[ATTR0]] = { strictfp }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }
 ; CHECK: attributes #[[ATTR3]] = { noinline }
 ; CHECK: attributes #[[ATTR4]] = { nobuiltin }
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
new file mode 100644
index 0000000..253a6ec
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
+; CHECK-LABEL: callbr_inline_asm:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_load_dword v0, v[0:1]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  ; %bb.1: ; %fallthrough
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dword v[2:3], v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; CHECK-NEXT:  .LBB0_2: ; Inline asm indirect target
+; CHECK-NEXT:    ; %indirect
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dword v[4:5], v0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+	%a = load i32, ptr %src, align 4
+	callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
+fallthrough:
+	store i32 %a, ptr %dst1, align 4
+	br label %ret
+indirect:
+	store i32 %a, ptr %dst2, align 4
+	br label %ret
+ret:
+	ret void
+}
+
+define void @callbr_self_loop(i1 %c) {
+; CHECK-LABEL: callbr_self_loop:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:  .LBB1_1: ; %callbr
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_branch .LBB1_1
+; CHECK-NEXT:  .LBB1_2: ; Inline asm indirect target
+; CHECK-NEXT:    ; %callbr.target.ret
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  br label %callbr
+callbr:
+  callbr void asm "", "!i"() to label %callbr [label %ret]
+ret:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
index 007e3f0..076a99f 100644
--- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
+++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
@@ -3,6 +3,7 @@
 
 declare void @foo(ptr)
 declare i1 @bar(ptr)
+declare i32 @bar32(ptr)
 
 define void @musttail_call_without_return_value(ptr %p) {
 ; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -28,6 +29,31 @@ bb.1:
   ret void
 }
 
+define void @musttail_call_without_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT:            to label %[[BB_0:.*]] [label %bb.1]
+; CHECK:       [[BB_0]]:
+; CHECK-NEXT:    musttail call void @foo(ptr [[P]])
+; CHECK-NEXT:    ret void
+; CHECK:       [[BB_1:.*:]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %load = load i32, ptr %p, align 1
+  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+  musttail call void @foo(ptr %p)
+  ret void
+
+bb.1:
+  ret void
+}
+
 define i1 @musttail_call_with_return_value(ptr %p) {
 ; CHECK-LABEL: define i1 @musttail_call_with_return_value(
 ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -51,3 +77,28 @@ bb.0:
 bb.1:
   ret i1 %load
 }
+
+define i32 @musttail_call_with_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT:    callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT:            to label %[[BB_0:.*]] [label %bb.1]
+; CHECK:       [[BB_0]]:
+; CHECK-NEXT:    [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
+; CHECK-NEXT:    ret i32 [[RET]]
+; CHECK:       [[BB_1:.*:]]
+; CHECK-NEXT:    ret i32 [[LOAD]]
+;
+entry:
+  %load = load i32, ptr %p, align 1
+  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+  %ret = musttail call i32 @bar32(ptr %p)
+  ret i32 %ret
+
+bb.1:
+  ret i32 %load
+}
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e2e43f..df63592 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -36,26 +36,60 @@ loop:
   br label %loop
 }
 
+define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_callbr:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loop_callbr(
+; IR-NEXT:  entry:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[LOOP:%.*]] []
+; IR:       loop:
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR:       TransitionBlock:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[LOOP]] []
+; IR:       DummyReturnBlock:
+; IR-NEXT:    ret void
+;
+entry:
+  callbr void asm "", ""() to label %loop []
+
+loop:
+  store volatile i32 999, ptr addrspace(1) %out, align 4
+  callbr void asm "", ""() to label %loop []
+}
+
 define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loop_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT:    s_cbranch_execz .LBB1_3
+; SI-NEXT:    s_cbranch_execz .LBB2_3
 ; SI-NEXT:  ; %bb.1: ; %loop.preheader
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
 ; SI-NEXT:    s_and_b64 vcc, exec, -1
-; SI-NEXT:  .LBB1_2: ; %loop
+; SI-NEXT:  .LBB2_2: ; %loop
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB1_2
-; SI-NEXT:  .LBB1_3: ; %UnifiedReturnBlock
+; SI-NEXT:    s_cbranch_vccnz .LBB2_2
+; SI-NEXT:  .LBB2_3: ; %UnifiedReturnBlock
 ; SI-NEXT:    s_endpgm
 ; IR-LABEL: @infinite_loop_ret(
 ; IR-NEXT:  entry:
@@ -81,44 +115,93 @@ return:
   ret void
 }
 
+define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_ret_callbr:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:  ; %bb.1: ; %loop.preheader
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:  .LBB3_2: ; Inline asm indirect target
+; SI-NEXT:    ; %UnifiedReturnBlock
+; SI-NEXT:    ; Label of block must be emitted
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loop_ret_callbr(
+; IR-NEXT:  entry:
+; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP]], 1
+; IR-NEXT:    [[COND32:%.*]] = zext i1 [[COND]] to i32
+; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND32]])
+; IR-NEXT:            to label [[LOOP:%.*]] [label %UnifiedReturnBlock]
+; IR:       loop:
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR:       TransitionBlock:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[LOOP]] []
+; IR:       UnifiedReturnBlock:
+; IR-NEXT:    ret void
+;
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %cond = icmp eq i32 %tmp, 1
+  %cond32 = zext i1 %cond to i32
+  callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return]
+
+loop:
+  store volatile i32 999, ptr addrspace(1) %out, align 4
+  callbr void asm "", ""() to label %loop []
+
+return:
+  ret void
+}
+
 define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loops:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
 ; SI-NEXT:    s_mov_b64 s[2:3], -1
-; SI-NEXT:    s_cbranch_scc1 .LBB2_4
+; SI-NEXT:    s_cbranch_scc1 .LBB4_4
 ; SI-NEXT:  ; %bb.1:
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x378
 ; SI-NEXT:    s_and_b64 vcc, exec, -1
-; SI-NEXT:  .LBB2_2: ; %loop2
+; SI-NEXT:  .LBB4_2: ; %loop2
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccnz .LBB2_2
+; SI-NEXT:    s_cbranch_vccnz .LBB4_2
 ; SI-NEXT:  ; %bb.3: ; %Flow
 ; SI-NEXT:    s_mov_b64 s[2:3], 0
-; SI-NEXT:  .LBB2_4: ; %Flow2
+; SI-NEXT:  .LBB4_4: ; %Flow2
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccz .LBB2_7
+; SI-NEXT:    s_cbranch_vccz .LBB4_7
 ; SI-NEXT:  ; %bb.5:
 ; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
 ; SI-NEXT:    s_and_b64 vcc, exec, 0
-; SI-NEXT:  .LBB2_6: ; %loop1
+; SI-NEXT:  .LBB4_6: ; %loop1
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b64 vcc, vcc
-; SI-NEXT:    s_cbranch_vccz .LBB2_6
-; SI-NEXT:  .LBB2_7: ; %DummyReturnBlock
+; SI-NEXT:    s_cbranch_vccz .LBB4_6
+; SI-NEXT:  .LBB4_7: ; %DummyReturnBlock
 ; SI-NEXT:    s_endpgm
 ; IR-LABEL: @infinite_loops(
 ; IR-NEXT:  entry:
@@ -144,24 +227,78 @@ loop2:
   br label %loop2
 }
 
+define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loops_callbr:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:  ; %bb.1: ; %loop1
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+; SI-NEXT:  .LBB5_2: ; Inline asm indirect target
+; SI-NEXT:    ; %loop2.preheader
+; SI-NEXT:    ; Label of block must be emitted
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x378
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loops_callbr(
+; IR-NEXT:  entry:
+; IR-NEXT:    callbr void asm "", "r,!i"(i32 poison)
+; IR-NEXT:            to label [[LOOP1:%.*]] [label %loop2]
+; IR:       loop1:
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR:       TransitionBlock:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[LOOP1]] []
+; IR:       loop2:
+; IR-NEXT:    store volatile i32 888, ptr addrspace(1) [[OUT]], align 4
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]]
+; IR:       TransitionBlock1:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[LOOP2:%.*]] []
+; IR:       DummyReturnBlock:
+; IR-NEXT:    ret void
+;
+entry:
+  callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2]
+
+loop1:
+  store volatile i32 999, ptr addrspace(1) %out, align 4
+  callbr void asm "", ""() to label %loop1 []
+
+loop2:
+  store volatile i32 888, ptr addrspace(1) %out, align 4
+  callbr void asm "", ""() to label %loop2 []
+}
+
 define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
 ; SI-LABEL: infinite_loop_nest_ret:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT:    s_cbranch_execz .LBB3_5
+; SI-NEXT:    s_cbranch_execz .LBB6_5
 ; SI-NEXT:  ; %bb.1: ; %outer_loop.preheader
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
 ; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
-; SI-NEXT:  .LBB3_2: ; %outer_loop
+; SI-NEXT:  .LBB6_2: ; %outer_loop
 ; SI-NEXT:    ; =>This Loop Header: Depth=1
-; SI-NEXT:    ; Child Loop BB3_3 Depth 2
+; SI-NEXT:    ; Child Loop BB6_3 Depth 2
 ; SI-NEXT:    s_mov_b64 s[2:3], 0
-; SI-NEXT:  .LBB3_3: ; %inner_loop
-; SI-NEXT:    ; Parent Loop BB3_2 Depth=1
+; SI-NEXT:  .LBB6_3: ; %inner_loop
+; SI-NEXT:    ; Parent Loop BB6_2 Depth=1
 ; SI-NEXT:    ; => This Inner Loop Header: Depth=2
 ; SI-NEXT:    s_and_b64 s[8:9], exec, s[0:1]
 ; SI-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3]
@@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; SI-NEXT:    s_cbranch_execnz .LBB3_3
+; SI-NEXT:    s_cbranch_execnz .LBB6_3
 ; SI-NEXT:  ; %bb.4: ; %loop.exit.guard
-; SI-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; SI-NEXT:    ; in Loop: Header=BB6_2 Depth=1
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_mov_b64 vcc, 0
-; SI-NEXT:    s_branch .LBB3_2
-; SI-NEXT:  .LBB3_5: ; %UnifiedReturnBlock
+; SI-NEXT:    s_branch .LBB6_2
+; SI-NEXT:  .LBB6_5: ; %UnifiedReturnBlock
 ; SI-NEXT:    s_endpgm
 ; IR-LABEL: @infinite_loop_nest_ret(
 ; IR-NEXT:  entry:
@@ -212,4 +349,82 @@ return:
   ret void
 }
 
+define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_nest_ret_callbr:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:  ; %bb.1: ; %outer_loop.preheader
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT:    s_and_b64 s[0:1], exec, 0
+; SI-NEXT:    s_branch .LBB7_3
+; SI-NEXT:  .LBB7_2: ; %loop.exit.guard
+; SI-NEXT:    ; in Loop: Header=BB7_3 Depth=1
+; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    s_cbranch_vccnz .LBB7_5
+; SI-NEXT:  .LBB7_3: ; %outer_loop
+; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
+; SI-NEXT:    ;;#ASMSTART
+; SI-NEXT:    ;;#ASMEND
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_mov_b64 s[2:3], -1
+; SI-NEXT:    s_mov_b64 vcc, s[0:1]
+; SI-NEXT:    s_cbranch_vccz .LBB7_2
+; SI-NEXT:  ; %bb.4: ; %TransitionBlock.target.outer_loop
+; SI-NEXT:    ; in Loop: Header=BB7_3 Depth=1
+; SI-NEXT:    s_mov_b64 s[2:3], 0
+; SI-NEXT:    s_branch .LBB7_2
+; SI-NEXT:  .LBB7_5: ; Inline asm indirect target
+; SI-NEXT:    ; %UnifiedReturnBlock
+; SI-NEXT:    ; Label of block must be emitted
+; SI-NEXT:    s_endpgm
+; IR-LABEL: @infinite_loop_nest_ret_callbr(
+; IR-NEXT:  entry:
+; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[COND1:%.*]] = icmp ne i32 [[TMP]], 1
+; IR-NEXT:    [[COND1_32:%.*]] = zext i1 [[COND1]] to i32
+; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND1_32]])
+; IR-NEXT:            to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock]
+; IR:       outer_loop:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[INNER_LOOP:%.*]] []
+; IR:       inner_loop:
+; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT:    [[COND3:%.*]] = icmp eq i32 [[TMP]], 3
+; IR-NEXT:    [[COND3_32:%.*]] = zext i1 [[COND3]] to i32
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR:       TransitionBlock:
+; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND3_32]])
+; IR-NEXT:            to label [[INNER_LOOP]] [label %outer_loop]
+; IR:       UnifiedReturnBlock:
+; IR-NEXT:    ret void
+;
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %cond1 = icmp ne i32 %tmp, 1  ; avoid following BB optimizing away through the domination
+  %cond1_32 = zext i1 %cond1 to i32
+  callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return]
+
+outer_loop:
+  ; %cond2 = icmp eq i32 %tmp, 2
+  ; br i1 %cond2, label %outer_loop, label %inner_loop
+  callbr void asm "", ""() to label %inner_loop []
+
+inner_loop:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 999, ptr addrspace(1) %out, align 4
+  %cond3 = icmp eq i32 %tmp, 3
+  %cond3_32 = zext i1 %cond3 to i32
+  callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop]
+
+return:
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll
new file mode 100644
index 0000000..8eefc9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/private-function.ll
@@ -0,0 +1,16 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+
+define private void @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT:    s_wait_expcnt 0x0
+; CHECK-NEXT:    s_wait_samplecnt 0x0
+; CHECK-NEXT:    s_wait_bvhcnt 0x0
+; CHECK-NEXT:    s_wait_kmcnt 0x0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  ret void
+}
+
+@var = global ptr @foo
diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index 002d43f..1316569 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX
 
@@ -41,6 +42,27 @@ body:             |
 ...
 
 ---
+name:            meta_in_between
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: meta_in_between
+    ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+    ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GCN-NEXT: KILL $sgpr0
+    ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF
+    ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+  S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+  $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+  KILL $sgpr0
+  $sgpr0 = IMPLICIT_DEF
+  S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode
+  $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+  S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
+...
+
+---
 name:            valu_write_in_between
 body:             |
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 34de1e4..01bcdad 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -3,15 +3,16 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
 
 define void @nested_inf_loop(i1 %0, i1 %1) {
-; OPT-LABEL: @nested_inf_loop(
-; OPT-NEXT:  BB:
-; OPT-NEXT:    br label [[BB1:%.*]]
-; OPT:       BB1:
-; OPT-NEXT:    [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
-; OPT-NEXT:    br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
-; OPT:       infloop:
-; OPT-NEXT:    br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
-; OPT:       DummyReturnBlock:
+; OPT-LABEL: define void @nested_inf_loop(
+; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    br label %[[BB1:.*]]
+; OPT:       [[BB1]]:
+; OPT-NEXT:    [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]]
+; OPT-NEXT:    br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]]
+; OPT:       [[INFLOOP]]:
+; OPT-NEXT:    br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]]
+; OPT:       [[DUMMYRETURNBLOCK]]:
 ; OPT-NEXT:    ret void
 ;
 ; ISA-LABEL: nested_inf_loop:
@@ -63,3 +64,84 @@ BB4:
 BB3:
   br label %BB1
 }
+
+define void @nested_inf_loop_callbr(i32 %0, i32 %1) {
+; OPT-LABEL: define void @nested_inf_loop_callbr(
+; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; OPT-NEXT:  [[BB:.*:]]
+; OPT-NEXT:    callbr void asm "", ""()
+; OPT-NEXT:            to label %[[BB1:.*]] []
+; OPT:       [[BB1]]:
+; OPT-NEXT:    callbr void asm "", "r,!i"(i32 [[TMP0]])
+; OPT-NEXT:            to label %[[BB3:.*]] [label %BB2]
+; OPT:       [[BB2:.*:]]
+; OPT-NEXT:    callbr void asm "", ""()
+; OPT-NEXT:            to label %[[BB4:.*]] []
+; OPT:       [[BB4]]:
+; OPT-NEXT:    br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]]
+; OPT:       [[TRANSITIONBLOCK]]:
+; OPT-NEXT:    callbr void asm "", "r,!i"(i32 [[TMP1]])
+; OPT-NEXT:            to label %[[BB3]] [label %BB4]
+; OPT:       [[BB3]]:
+; OPT-NEXT:    callbr void asm "", ""()
+; OPT-NEXT:            to label %[[BB1]] []
+; OPT:       [[DUMMYRETURNBLOCK]]:
+; OPT-NEXT:    ret void
+;
+; ISA-LABEL: nested_inf_loop_callbr:
+; ISA:       ; %bb.0: ; %BB
+; ISA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT:    ;;#ASMSTART
+; ISA-NEXT:    ;;#ASMEND
+; ISA-NEXT:    ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; ISA-NEXT:  .LBB1_1: ; %BB1
+; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1
+; ISA-NEXT:    ;;#ASMSTART
+; ISA-NEXT:    ;;#ASMEND
+; ISA-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec
+; ISA-NEXT:    s_and_b64 s[8:9], s[4:5], exec
+; ISA-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
+; ISA-NEXT:  .LBB1_2: ; %BB3
+; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT:    ;;#ASMSTART
+; ISA-NEXT:    ;;#ASMEND
+; ISA-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec
+; ISA-NEXT:    s_and_b64 s[8:9], s[6:7], exec
+; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; ISA-NEXT:    s_branch .LBB1_1
+; ISA-NEXT:  .LBB1_3: ; Inline asm indirect target
+; ISA-NEXT:    ; %BB2
+; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT:    ; Label of block must be emitted
+; ISA-NEXT:    ;;#ASMSTART
+; ISA-NEXT:    ;;#ASMEND
+; ISA-NEXT:    s_mov_b64 s[6:7], -1
+; ISA-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5]
+; ISA-NEXT:    s_cbranch_execz .LBB1_5
+; ISA-NEXT:  ; %bb.4: ; %TransitionBlock.target.BB3
+; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT:  .LBB1_5: ; %loop.exit.guard
+; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT:    s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; ISA-NEXT:    s_mov_b64 s[6:7], 0
+; ISA-NEXT:    s_cbranch_vccz .LBB1_2
+; ISA-NEXT:  ; %bb.6: ; %DummyReturnBlock
+; ISA-NEXT:    s_setpc_b64 s[30:31]
+BB:
+  callbr void asm "", ""() to label %BB1 []
+
+BB1:
+  callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2]
+
+BB2:
+  callbr void asm "", ""() to label %BB4 []
+
+BB4:
+  callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4]
+
+BB3:
+  callbr void asm "", ""() to label %BB1 []
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 4cbe682..004c279 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
 ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s
 
 declare void @llvm.trap()
@@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
 ; CHECK-NEXT:    s_mov_b64 s[2:3], -1
 ; CHECK-NEXT:    s_trap 2
 ; CHECK-NEXT:    s_branch .LBB0_4
-
-
+; UNIFY-LABEL: @kernel(
+; UNIFY-NEXT:  entry:
+; UNIFY-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; UNIFY-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256
+; UNIFY-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; UNIFY:       if.then:
+; UNIFY-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; UNIFY-NEXT:    br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]]
+; UNIFY:       cond.false:
+; UNIFY-NEXT:    call void @llvm.trap()
+; UNIFY-NEXT:    unreachable
+; UNIFY:       if.else:
+; UNIFY-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TID]], 10
+; UNIFY-NEXT:    br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]]
+; UNIFY:       if.then3:
+; UNIFY-NEXT:    [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0
+; UNIFY-NEXT:    br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]]
+; UNIFY:       cond.false.i8:
+; UNIFY-NEXT:    call void @llvm.trap()
+; UNIFY-NEXT:    unreachable
+; UNIFY:       if.end6.sink.split:
+; UNIFY-NEXT:    [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]]
+; UNIFY-NEXT:    store i32 [[A]], ptr addrspace(1) [[X1]], align 4
+; UNIFY-NEXT:    br label [[IF_END6]]
+; UNIFY:       if.end6:
+; UNIFY-NEXT:    ret void
+;
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp = icmp eq i32 %n, 256
@@ -105,5 +130,129 @@ if.end6.sink.split:
 if.end6:
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; UNIFY: {{.*}}
+
+define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
+; CHECK-LABEL: kernel_callbr:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s1, s[8:9], 0x10
+; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_cmpk_eq_i32 s1, 0x100
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  ; %bb.1: ; %if.then
+; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB1_2: ; %if.end6.sink.split
+; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x8
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_store_dword v0, v1, s[2:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  .LBB1_3: ; Inline asm indirect target
+; CHECK-NEXT:    ; %UnifiedReturnBlock
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:    s_endpgm
+; CHECK-NEXT:  .LBB1_4: ; Inline asm indirect target
+; CHECK-NEXT:    ; %if.else
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 10, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:  ; %bb.5: ; %if.then3
+; CHECK-NEXT:    s_cmp_eq_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_branch .LBB1_2
+; CHECK-NEXT:  .LBB1_6: ; Inline asm indirect target
+; CHECK-NEXT:    ; %cond.false.i8
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:  .LBB1_7: ; Inline asm indirect target
+; CHECK-NEXT:    ; %cond.false
+; CHECK-NEXT:    ; Label of block must be emitted
+; CHECK-NEXT:    s_trap 2
+; CHECK-NEXT:    ; divergent unreachable
+; CHECK-NEXT:    s_branch .LBB1_3
+; UNIFY-LABEL: @kernel_callbr(
+; UNIFY-NEXT:  entry:
+; UNIFY-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; UNIFY-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256
+; UNIFY-NEXT:    [[CMP32:%.*]] = zext i1 [[CMP]] to i32
+; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP32]])
+; UNIFY-NEXT:            to label [[IF_THEN:%.*]] [label %if.else]
+; UNIFY:       if.then:
+; UNIFY-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; UNIFY-NEXT:    [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32
+; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP1_32]])
+; UNIFY-NEXT:            to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false]
+; UNIFY:       cond.false:
+; UNIFY-NEXT:    call void @llvm.trap()
+; UNIFY-NEXT:    unreachable
+; UNIFY:       if.else:
+; UNIFY-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TID]], 10
+; UNIFY-NEXT:    [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32
+; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP2_32]])
+; UNIFY-NEXT:            to label [[IF_THEN3:%.*]] [label %if.end6]
+; UNIFY:       if.then3:
+; UNIFY-NEXT:    [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0
+; UNIFY-NEXT:    [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32
+; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]])
+; UNIFY-NEXT:            to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8]
+; UNIFY:       cond.false.i8:
+; UNIFY-NEXT:    call void @llvm.trap()
+; UNIFY-NEXT:    unreachable
+; UNIFY:       if.end6.sink.split:
+; UNIFY-NEXT:    [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]]
+; UNIFY-NEXT:    store i32 [[A]], ptr addrspace(1) [[X1]], align 4
+; UNIFY-NEXT:    callbr void asm "", ""()
+; UNIFY-NEXT:            to label [[IF_END6:%.*]] []
+; UNIFY:       if.end6:
+; UNIFY-NEXT:    ret void
+;
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %cmp = icmp eq i32 %n, 256
+  %cmp32 = zext i1 %cmp to i32
+  callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else]
+
+if.then:
+  %cmp1 = icmp eq i32 %a, 0
+  %cmp1_32 = zext i1 %cmp1 to i32
+  callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false]
+
+cond.false:
+  call void @llvm.trap()
+  unreachable
+
+if.else:
+  %cmp2 = icmp ult i32 %tid, 10
+  %cmp2_32 = zext i1 %cmp2 to i32
+  callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6]
+
+if.then3:
+  %cmp1.i7 = icmp eq i32 %a, 0
+  %cmp1.i7_32 = zext i1 %cmp1.i7 to i32
+  callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8]
+
+cond.false.i8:
+  call void @llvm.trap()
+  unreachable
+
+if.end6.sink.split:
+  %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid
+  store i32 %a, ptr addrspace(1) %x1, align 4
+  callbr void asm "", ""() to label %if.end6 []
+
+if.end6:
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll
index 50666be..684dc1a 100644
--- a/llvm/test/CodeGen/AMDGPU/update-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -37,3 +37,42 @@ n28:                                               ; preds = %.loopexit, %n28
 n31:                                               ; preds =
   ret void
 }
+
+define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 {
+; IR-LABEL: @_amdgpu_ps_main_callbr(
+; IR-NEXT:  .entry:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[DOTLOOPEXIT:%.*]] []
+; IR:       .loopexit:
+; IR-NEXT:    callbr void asm "", ""()
+; IR-NEXT:            to label [[N28:%.*]] []
+; IR:       n28:
+; IR-NEXT:    [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ]
+; IR-NEXT:    [[N29]] = fadd float [[DOT01]], 1.000000e+00
+; IR-NEXT:    [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00
+; IR-NEXT:    [[N30_32:%.*]] = zext i1 [[N30]] to i32
+; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR:       TransitionBlock:
+; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[N30_32]])
+; IR-NEXT:            to label [[DOTLOOPEXIT]] [label %n28]
+; IR:       n31:
+; IR-NEXT:    ret void
+; IR:       DummyReturnBlock:
+; IR-NEXT:    ret void
+;
+.entry:
+  callbr void asm "", ""() to label %.loopexit []
+
+.loopexit:                                        ; preds = %n28, %.entry
+  callbr void asm "", ""() to label %n28 []
+
+n28:                                               ; preds = %.loopexit, %n28
+  %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ]
+  %n29 = fadd float %.01, 1.0
+  %n30 = fcmp ogt float %n29, 4.000000e+00
+  %n30.32 = zext i1 %n30 to i32
+  callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28]
+
+n31:                                               ; preds =
+  ret void
+}
