; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s

; Tests that a ds_read_b64_tr_b16 followed by a per-lane shuffle is lowered to
; the transposed LDS read plus v_perm byte permutes, and that the 8192-byte
; offset (gep of 2048 x 4-byte AS(3) pointers) folds into the ds instruction
; only when the address has no other (non-foldable) uses.
;
; NOTE(review): the shufflevector masks below were reconstructed from the
; autogenerated v_perm_b32 selectors (0x5040100 = halves {0,2} of v[2:3],
; 0x7060302 = halves {1,3}); the original masks were lost in transit — confirm
; against the upstream test.

; Address use is fully foldable: the 8192-byte offset is folded into
; ds_read_b64_tr_b16 as offset:8192 in both branches.
define amdgpu_kernel void @memoryIntrinstic(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr) {
; CHECK-LABEL: memoryIntrinstic:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_bitcmp0_b32 s1, 0
; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
; CHECK-NEXT:  ; %bb.1: ; %else
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0 offset:8192
; CHECK-NEXT:    s_mov_b32 s1, 0x7060302
; CHECK-NEXT:    s_mov_b32 s3, 0x5040100
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s1
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s3
; CHECK-NEXT:    s_cbranch_execz .LBB0_3
; CHECK-NEXT:    s_branch .LBB0_4
; CHECK-NEXT:  .LBB0_2:
; CHECK-NEXT:    ; implicit-def: $vgpr1
; CHECK-NEXT:  .LBB0_3: ; %then
; CHECK-NEXT:    v_mov_b32_e32 v0, s0
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0 offset:8192
; CHECK-NEXT:    s_mov_b32 s0, 0x5040100
; CHECK-NEXT:    s_mov_b32 s1, 0x7060302
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT:  .LBB0_4: ; %end
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    ds_write_b64 v2, v[0:1]
; CHECK-NEXT:    s_endpgm
  ; 2048 * 4 bytes (AS(3) pointer size) = 8192-byte offset from %inptr
  %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
  br i1 %cond, label %then, label %else

then:
  %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  ; Mask reconstructed from then-block perms (0x5040100 then 0x7060302).
  %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  br label %end

else:
  %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  ; Mask reconstructed from else-block perms (0x7060302 then 0x5040100).
  %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  br label %end

end:
  %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
  store <4 x half> %res, ptr addrspace(3) %outptr
  ret void
}

; %gep0 escapes into a buffer-store intrinsic in %else, so the full address is
; materialized up front (s_add_i32 ... 0x2000) and the ds reads use offset 0.
define amdgpu_kernel void @badIntrinsicUse(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, <4 x i32> %rsrc) {
; CHECK-LABEL: badIntrinsicUse:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_and_b32 s1, s1, 1
; CHECK-NEXT:    s_add_i32 s3, s0, 0x2000
; CHECK-NEXT:    s_cmp_eq_u32 s1, 0
; CHECK-NEXT:    s_cbranch_scc0 .LBB1_2
; CHECK-NEXT:  ; %bb.1: ; %else
; CHECK-NEXT:    v_mov_b32_e32 v0, s3
; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT:    s_mov_b32 s0, 0x7060302
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT:    s_mov_b32 s0, 0x5040100
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s0
; CHECK-NEXT:    s_cbranch_execz .LBB1_3
; CHECK-NEXT:    s_branch .LBB1_4
; CHECK-NEXT:  .LBB1_2:
; CHECK-NEXT:    ; implicit-def: $vgpr1
; CHECK-NEXT:  .LBB1_3: ; %then
; CHECK-NEXT:    v_mov_b32_e32 v0, s3
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT:    s_mov_b32 s0, 0x5040100
; CHECK-NEXT:    s_mov_b32 s1, 0x7060302
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT:  .LBB1_4: ; %end
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    ds_write_b64 v2, v[0:1]
; CHECK-NEXT:    s_endpgm
  %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
  br i1 %cond, label %then, label %else

then:
  %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  br label %end

else:
  %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  ; Escaping use of %gep0 that cannot fold the offset. NOTE(review): the
  ; overloaded intrinsic needs a type-mangled name (.p3) to parse — confirm
  ; the suffix against the upstream test.
  call void @llvm.amdgcn.raw.buffer.store.p3(ptr addrspace(3) %gep0, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
  %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  br label %end

end:
  %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
  store <4 x half> %res, ptr addrspace(3) %outptr
  ret void
}

; %gep0 escapes through readfirstlane and is stored to LDS in %else, so again
; the address is materialized with s_add_i32 and the ds reads use offset 0.
define amdgpu_kernel void @badIntrinsicUse2(ptr addrspace(3) %inptr, i1 %cond, ptr addrspace(3) %outptr, ptr addrspace(3) %outptr1) {
; CHECK-LABEL: badIntrinsicUse2:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_and_b32 s1, s1, 1
; CHECK-NEXT:    s_add_i32 s4, s0, 0x2000
; CHECK-NEXT:    s_cmp_eq_u32 s1, 0
; CHECK-NEXT:    s_cbranch_scc0 .LBB2_2
; CHECK-NEXT:  ; %bb.1: ; %else
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT:    v_mov_b32_e32 v0, s3
; CHECK-NEXT:    v_mov_b32_e32 v1, s4
; CHECK-NEXT:    s_mov_b32 s0, 0x7060302
; CHECK-NEXT:    ds_write_b32 v0, v1
; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT:    s_mov_b32 s0, 0x5040100
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s0
; CHECK-NEXT:    s_cbranch_execz .LBB2_3
; CHECK-NEXT:    s_branch .LBB2_4
; CHECK-NEXT:  .LBB2_2:
; CHECK-NEXT:    ; implicit-def: $vgpr1
; CHECK-NEXT:  .LBB2_3: ; %then
; CHECK-NEXT:    v_mov_b32_e32 v0, s4
; CHECK-NEXT:    ds_read_b64_tr_b16 v[2:3], v0
; CHECK-NEXT:    s_mov_b32 s0, 0x5040100
; CHECK-NEXT:    s_mov_b32 s1, 0x7060302
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_perm_b32 v0, v3, v2, s0
; CHECK-NEXT:    v_perm_b32 v1, v3, v2, s1
; CHECK-NEXT:  .LBB2_4: ; %end
; CHECK-NEXT:    v_mov_b32_e32 v2, s2
; CHECK-NEXT:    ds_write_b64 v2, v[0:1]
; CHECK-NEXT:    s_endpgm
  %gep0 = getelementptr ptr addrspace(3), ptr addrspace(3) %inptr, i32 2048
  br i1 %cond, label %then, label %else

then:
  %load0 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  %shuf0 = shufflevector <4 x half> %load0, <4 x half> %load0, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  br label %end

else:
  %load1 = tail call <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) %gep0)
  ; Escaping use of %gep0: the pointer itself is laundered through
  ; readfirstlane and stored out. NOTE(review): overloaded intrinsic given a
  ; mangled name (.p3) — confirm the suffix against the upstream test.
  %gep1 = call ptr addrspace(3) @llvm.amdgcn.readfirstlane.p3(ptr addrspace(3) %gep0)
  store ptr addrspace(3) %gep1, ptr addrspace(3) %outptr1
  %shuf1 = shufflevector <4 x half> %load1, <4 x half> %load1, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
  br label %end

end:
  %res = phi <4 x half> [ %shuf0, %then ], [ %shuf1, %else ]
  store <4 x half> %res, ptr addrspace(3) %outptr
  ret void
}