diff options
Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/wait-xcnt.mir | 176 | ||||
| -rw-r--r-- | llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll | 63 | ||||
| -rw-r--r-- | llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll | 63 | ||||
| -rw-r--r-- | llvm/test/CodeGen/NVPTX/f16-ex2.ll | 40 | ||||
| -rw-r--r-- | llvm/test/CodeGen/NVPTX/f32-ex2.ll | 7 |
5 files changed, 339 insertions, 10 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index 1b8e126..a1381ec 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -945,7 +945,6 @@ body: | $vgpr0 = V_MOV_B32_e32 0, implicit $exec ... -# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. --- name: wait_kmcnt_with_outstanding_vmem_2 tracksRegLiveness: true @@ -971,6 +970,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAIT_KMCNT 0 ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.0: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc @@ -986,6 +986,180 @@ body: | ... --- +name: wait_kmcnt_and_wait_loadcnt +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_LOADCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec +... + +--- +name: implicit_handling_of_pending_vmem_group +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: pending_vmem_event_between_block +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: pending_vmem_event_between_block + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: S_WAIT_XCNT 1 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- +name: flushing_vmem_cnt_on_block_entry +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_XCNT 0 + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0 + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2 + $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec + bb.2: + liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + $sgpr0 = S_MOV_B32 $sgpr0 +... + +--- name: wait_loadcnt_with_outstanding_smem tracksRegLiveness: true machineFunctionInfo: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll index ba2118f..b3155c9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvxori.b $xr0, $xr0, 255 +; CHECK-NEXT: xvclz.b $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <32 x i8>, ptr %src + %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false) + store <32 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.h $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i16>, ptr %src + %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false) + store <16 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i32>, ptr %src + %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false) + store <8 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvrepli.b $xr1, -1 +; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvclz.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i64>, ptr %src + %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1> + %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false) + store <4 x i64> %res, ptr %dst + ret void +} + declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll index a9a38e8..6ac7d51 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll @@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind { ret void } +define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vxori.b $vr0, $vr0, 255 +; CHECK-NEXT: vclz.b $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <16 x i8>, ptr %src + %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false) + store <16 x i8> %res, ptr %dst + ret void +} + +define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.h $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <8 x i16>, ptr %src + %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false) + store <8 x i16> %res, ptr %dst + ret void +} + +define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.w $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <4 x i32>, ptr %src + %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1> + %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false) + store <4 x i32> %res, ptr %dst + ret void +} + +define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind { +; CHECK-LABEL: not_ctlz_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vrepli.b $vr1, -1 +; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 +; CHECK-NEXT: vclz.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a1, 0 +; CHECK-NEXT: ret + %v = load <2 x i64>, ptr %src + %neg = xor <2 x i64> %v, <i64 -1, i64 -1> + %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false) + store <2 x i64> %res, ptr %dst + ret void +} + declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>) diff --git a/llvm/test/CodeGen/NVPTX/f16-ex2.ll b/llvm/test/CodeGen/NVPTX/f16-ex2.ll index ee79f9d..af3fe67 100644 --- a/llvm/test/CodeGen/NVPTX/f16-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f16-ex2.ll @@ -1,12 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_75 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK-FP16 %s -; RUN: %if ptxas-sm_75 && ptxas-isa-7.0 %{ llc < %s -mcpu=sm_75 -mattr=+ptx70 | %ptxas-verify -arch=sm_75 %} +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK-FP16 %s +; RUN: %if ptxas-sm_90 && ptxas-isa-7.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %} target triple = "nvptx64-nvidia-cuda" declare half @llvm.nvvm.ex2.approx.f16(half) -declare <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half>) +declare <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half>) +declare bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat) +declare <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat>) -; CHECK-LABEL: ex2_half define half @ex2_half(half %0) { ; CHECK-FP16-LABEL: ex2_half( ; CHECK-FP16: { @@ -21,7 +22,6 @@ define half @ex2_half(half %0) { ret half %res } -; CHECK-LABEL: ex2_2xhalf define <2 x half> @ex2_2xhalf(<2 x half> %0) { ; CHECK-FP16-LABEL: ex2_2xhalf( ; CHECK-FP16: { @@ -32,6 +32,34 @@ define <2 x half> @ex2_2xhalf(<2 x half> %0) { ; CHECK-FP16-NEXT: ex2.approx.f16x2 %r2, %r1; ; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-FP16-NEXT: ret; - %res = call <2 x half> @llvm.nvvm.ex2.approx.f16x2(<2 x half> %0) + %res = call <2 x half> @llvm.nvvm.ex2.approx.v2f16(<2 x half> %0) ret <2 x half> %res } + +define bfloat @ex2_bfloat(bfloat %0) { +; CHECK-FP16-LABEL: ex2_bfloat( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b16 %rs<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b16 %rs1, [ex2_bfloat_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.bf16 %rs2, %rs1; +; CHECK-FP16-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-FP16-NEXT: ret; + %res = call bfloat @llvm.nvvm.ex2.approx.ftz.bf16(bfloat %0) + ret bfloat %res +} + +define <2 x bfloat> @ex2_2xbfloat(<2 x bfloat> %0) { +; CHECK-FP16-LABEL: ex2_2xbfloat( +; CHECK-FP16: { +; CHECK-FP16-NEXT: .reg .b32 %r<3>; +; CHECK-FP16-EMPTY: +; CHECK-FP16-NEXT: // %bb.0: +; CHECK-FP16-NEXT: ld.param.b32 %r1, [ex2_2xbfloat_param_0]; +; CHECK-FP16-NEXT: ex2.approx.ftz.bf16x2 %r2, %r1; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-FP16-NEXT: ret; + %res = call <2 x bfloat> @llvm.nvvm.ex2.approx.ftz.v2bf16(<2 x bfloat> %0) + ret <2 x bfloat> %res +} diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll index 796d80d..97b9d35 100644 --- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -3,7 +3,8 @@ ; RUN: %if ptxas-sm_50 && ptxas-isa-3.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_50 -mattr=+ptx32 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx-nvidia-cuda" -declare float @llvm.nvvm.ex2.approx.f(float) +declare float @llvm.nvvm.ex2.approx.f32(float) +declare float @llvm.nvvm.ex2.approx.ftz.f32(float) ; CHECK-LABEL: ex2_float define float @ex2_float(float %0) { @@ -16,7 +17,7 @@ define float @ex2_float(float %0) { ; CHECK-NEXT: ex2.approx.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %res = call float @llvm.nvvm.ex2.approx.f(float %0) + %res = call float @llvm.nvvm.ex2.approx.f32(float %0) ret float %res } @@ -31,6 +32,6 @@ define float @ex2_float_ftz(float %0) { ; CHECK-NEXT: ex2.approx.ftz.f32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; - %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0) + %res = call float @llvm.nvvm.ex2.approx.ftz.f32(float %0) ret float %res } |
