diff options
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll | 173 | 
1 files changed, 173 insertions, 0 deletions
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100  -o - %s | FileCheck %s

; Provenance: new file llvm/test/CodeGen/AMDGPU/amdgpu-miscellaneous-uniform-intrinsic.ll (blob 34d4c51).

; readfirstlane applied to the result of readfirstlane(5).
; The expected code contains no v_readfirstlane_b32: the literal 5 is moved
; into v1 and stored to %out.
define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readfirstlane_with_readfirstlane:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %inner = call i32 @llvm.amdgcn.readfirstlane(i32 5)
  %outer = call i32 @llvm.amdgcn.readfirstlane(i32 %inner)
  store i32 %outer, ptr addrspace(1) %out
  ret void
}

; readfirstlane applied to the result of readlane(workitem.id.x, workitem.id.y).
; The expected code keeps one v_readlane_b32; the only v_readfirstlane_b32 in
; the output produces the lane-select operand (s2) of that readlane, and no
; readfirstlane is applied to the readlane result.
define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readfirstlane_with_readlane:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_bfe_u32 v1, v0, 10, 10
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_readfirstlane_b32 s2, v1
; CHECK-NEXT:    v_readlane_b32 s2, v0, s2
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %id_x = call i32 @llvm.amdgcn.workitem.id.x()
  %id_y = call i32 @llvm.amdgcn.workitem.id.y()
  %lane_read = call i32 @llvm.amdgcn.readlane(i32 %id_x, i32 %id_y)
  %first = call i32 @llvm.amdgcn.readfirstlane(i32 %lane_read)
  store i32 %first, ptr addrspace(1) %out
  ret void
}

; readlane (lane 3) applied to the result of readfirstlane(workitem.id.x).
; The expected code contains only the v_readfirstlane_b32; no v_readlane_b32
; is emitted for the outer call.
define amdgpu_kernel void @readlane_with_firstlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readlane_with_firstlane:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_readfirstlane_b32 s2, v0
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %tidx)
  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 3)
  store i32 %v2, ptr addrspace(1) %out
  ret void
}

; readlane (lane 2) applied to the result of
; readlane(workitem.id.x, workitem.id.y).
; The expected code contains a single v_readlane_b32 (the inner one); the
; v_readfirstlane_b32 produces its lane-select operand.
define amdgpu_kernel void @readlane_readlane(ptr addrspace(1) %out) {
; CHECK-LABEL: readlane_readlane:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_bfe_u32 v1, v0, 10, 10
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; CHECK-NEXT:    v_readfirstlane_b32 s2, v1
; CHECK-NEXT:    v_readlane_b32 s2, v0, s2
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %tidx = call i32 @llvm.amdgcn.workitem.id.x()
  %tidy = call i32 @llvm.amdgcn.workitem.id.y()
  %v1 = call i32 @llvm.amdgcn.readlane(i32 %tidx, i32 %tidy)
  %v2 = call i32 @llvm.amdgcn.readlane(i32 %v1, i32 2)
  store i32 %v2, ptr addrspace(1) %out
  ret void
}

; permlane64 of a kernel argument.
; The expected code contains no v_permlane64_b32: the loaded argument (s2) is
; copied to v1 and stored directly.
define amdgpu_kernel void @permlane64_uniform(ptr addrspace(1) %out, i32 %src) {
; CHECK-LABEL: permlane64_uniform:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_clause 0x1
; CHECK-NEXT:    s_load_b32 s2, s[4:5], 0x8
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %v = call i32 @llvm.amdgcn.permlane64(i32 %src)
  store i32 %v, ptr addrspace(1) %out
  ret void
}

; permlane64 of workitem.id.x.
; The expected code keeps the v_permlane64_b32; the result is stored to the
; i32 element of %out indexed by the workitem id.
; Uses opaque `ptr addrspace(1)` like the other functions in this file.
define amdgpu_kernel void @permlane64_nonuniform(ptr addrspace(1) %out) {
; CHECK-LABEL: permlane64_nonuniform:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; CHECK-NEXT:    v_permlane64_b32 v1, v0
; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid)
  %out_ptr = getelementptr i32, ptr addrspace(1) %out, i32 %tid
  store i32 %v, ptr addrspace(1) %out_ptr
  ret void
}

; permlane64 of (workitem.id.x + 1).
; The expected code keeps the v_permlane64_b32, fed by the v_add_nc_u32; the
; result is stored to the i32 element of %out indexed by the workitem id.
; Uses opaque `ptr addrspace(1)` like the other functions in this file.
define amdgpu_kernel void @permlane64_nonuniform_expression(ptr addrspace(1) %out) {
; CHECK-LABEL: permlane64_nonuniform_expression:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; CHECK-NEXT:    v_add_nc_u32_e32 v1, 1, v0
; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CHECK-NEXT:    v_permlane64_b32 v1, v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid2 = add i32 %tid, 1
  %v = call i32 @llvm.amdgcn.permlane64(i32 %tid2)
  %out_ptr = getelementptr i32, ptr addrspace(1) %out, i32 %tid
  store i32 %v, ptr addrspace(1) %out_ptr
  ret void
}

; Loop whose exit test is `ballot.i64(%not_done) == 0`, where %not_done is the
; negation of a phi that is 0 on entry and 1 after the %if block has run.
; In the expected code the loop control consists of scalar moves, scalar ANDs
; with exec_lo, and scalar branches; the store of 5 sits in the %if block.
; Block names must stay as they are: they appear in the CHECK lines.
define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) %out) {
; CHECK-LABEL: trivial_waterfall_eq_zero:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
; CHECK-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
; CHECK-NEXT:    s_mov_b32 s2, 0
; CHECK-NEXT:    s_branch .LBB7_2
; CHECK-NEXT:  .LBB7_1: ; %Flow
; CHECK-NEXT:    ; in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s2
; CHECK-NEXT:    s_mov_b32 s2, -1
; CHECK-NEXT:    s_cbranch_vccz .LBB7_4
; CHECK-NEXT:  .LBB7_2: ; %while
; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
; CHECK-NEXT:    s_mov_b32 s2, -1
; CHECK-NEXT:    s_cbranch_vccnz .LBB7_1
; CHECK-NEXT:  ; %bb.3: ; %if
; CHECK-NEXT:    ; in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT:    s_mov_b32 s2, 0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    global_store_b32 v0, v1, s[0:1]
; CHECK-NEXT:    s_branch .LBB7_1
; CHECK-NEXT:  .LBB7_4: ; %exit
; CHECK-NEXT:    s_endpgm
entry:
  br label %while

while:
  %done = phi i1 [ 0, %entry ], [ 1, %if ]
  %not_done = xor i1 %done, true
  %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
  %is_done = icmp eq i64 %ballot, 0 ; in this case is_done = !not_done
  br i1 %is_done, label %exit, label %if

if:
  store i32 5, ptr addrspace(1) %out
  br label %while

exit:
  ret void
}
