diff options
Diffstat (limited to 'llvm/test/CodeGen/NVPTX')
-rw-r--r-- | llvm/test/CodeGen/NVPTX/bug26185-2.ll | 9 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/combine-wide.ll | 1339 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/fold-movs.ll | 38 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 168 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/ld-param-sink.ll | 47 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/local-stack-frame.ll | 7 | ||||
-rw-r--r-- | llvm/test/CodeGen/NVPTX/vector-loads.ll | 11 |
7 files changed, 1536 insertions, 83 deletions
diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index 4e11f58..46172b1 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -16,7 +16,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK: .maxntid 1, 1, 1 ; CHECK-NEXT: { ; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<9>; +; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb ; CHECK-NEXT: ld.param.b64 %rd1, [spam_param_0]; @@ -25,10 +25,9 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p ; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; ; CHECK-NEXT: ld.param.b64 %rd5, [spam_param_1]; ; CHECK-NEXT: ld.global.nc.s16 %r1, [%rd4+16]; -; CHECK-NEXT: mul.wide.s32 %rd6, %r1, %r1; -; CHECK-NEXT: ld.global.b64 %rd7, [%rd5]; -; CHECK-NEXT: add.s64 %rd8, %rd6, %rd7; -; CHECK-NEXT: st.global.b64 [%rd5], %rd8; +; CHECK-NEXT: ld.global.b64 %rd6, [%rd5]; +; CHECK-NEXT: mad.wide.s32 %rd7, %r1, %r1, %rd6; +; CHECK-NEXT: st.global.b64 [%rd5], %rd7; ; CHECK-NEXT: ret; bb: %tmp5 = add nsw i64 %arg3, 8 diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll new file mode 100644 index 0000000..ed4a2b6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll @@ -0,0 +1,1339 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -O1 | FileCheck %s --check-prefixes=CHECK,O1 +; RUN: llc < %s -O0 | FileCheck %s --check-prefixes=CHECK,O0 + +target triple = "nvptx64-nvidia-cuda" + +define i64 @t1(i32 %a, i32 %b, i64 %c) { +; +; O1-LABEL: t1( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t1_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t1_param_1]; +; O1-NEXT: ld.param.b64 %rd1, [t1_param_2]; +; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: ret; +; +; O0-LABEL: t1( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t1_param_2]; +; O0-NEXT: ld.param.b32 %r2, [t1_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t1_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd2, %r3; +; O0-NEXT: add.s64 %rd3, %rd1, %rd2; +; O0-NEXT: st.param.b64 [func_retval0], %rd3; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, %b + %sext = sext i32 %mul to i64 + %add = add i64 %c, %sext + ret i64 %add +} + +define i64 @t2(i32 %a, i32 %b, i64 %c) { +; +; O1-LABEL: t2( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t2_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t2_param_1]; +; O1-NEXT: ld.param.b64 %rd1, [t2_param_2]; +; O1-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: ret; +; +; O0-LABEL: t2( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t2_param_2]; +; O0-NEXT: ld.param.b32 %r2, [t2_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t2_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd2, %r3; +; O0-NEXT: add.s64 %rd3, %rd2, %rd1; +; O0-NEXT: st.param.b64 [func_retval0], %rd3; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, %b + %sext = sext i32 %mul to i64 + %add = add i64 %sext, %c + ret i64 %add +} + +define i64 @t3(i32 %a, i32 %b) { +; +; O1-LABEL: t3( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t3_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t3_param_1]; +; O1-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t3( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t3_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t3_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd1, %r3; +; O0-NEXT: add.s64 %rd2, %rd1, 1; +; O0-NEXT: st.param.b64 [func_retval0], %rd2; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, %b + %sext = sext i32 %mul to i64 + %add = add i64 1, %sext + ret i64 %add +} + +define i64 @t4(i32 %a, i64 %c) { +; +; O1-LABEL: t4( +; O1: { +; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b64 %rd<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t4_param_0]; +; O1-NEXT: ld.param.b64 %rd1, [t4_param_1]; +; O1-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: ret; +; +; O0-LABEL: t4( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t4_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t4_param_0]; +; O0-NEXT: mul.lo.s32 %r2, %r1, 3; +; O0-NEXT: cvt.s64.s32 %rd2, %r2; +; O0-NEXT: add.s64 %rd3, %rd1, %rd2; +; O0-NEXT: st.param.b64 [func_retval0], %rd3; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, 3 + %sext = sext i32 %mul to i64 + %add = add i64 %c, %sext + ret i64 %add +} + +define i64 @t4_1(i32 %a, i64 %c) { +; +; O1-LABEL: t4_1( +; O1: { +; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0]; +; O1-NEXT: mad.wide.s32 %rd1, %r1, 3, 5; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t4_1( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t4_1_param_0]; +; O0-NEXT: mul.lo.s32 %r2, %r1, 3; +; O0-NEXT: cvt.s64.s32 %rd1, %r2; +; O0-NEXT: add.s64 %rd2, %rd1, 5; +; O0-NEXT: st.param.b64 [func_retval0], %rd2; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, 3 + %sext = sext i32 %mul to i64 + %add = add i64 5, %sext + ret i64 %add +} + +define i64 @t5(i32 %a, i32 %b, i64 %c) { +; +; O1-LABEL: t5( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t5_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t5_param_1]; +; O1-NEXT: ld.param.b64 %rd1, [t5_param_2]; +; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: ret; +; +; O0-LABEL: t5( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t5_param_2]; +; O0-NEXT: ld.param.b32 %r2, [t5_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t5_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.u64.u32 %rd2, %r3; +; O0-NEXT: add.s64 %rd3, %rd1, %rd2; +; O0-NEXT: st.param.b64 [func_retval0], %rd3; +; O0-NEXT: ret; + %mul = mul nuw i32 %a, %b + %zext = zext i32 %mul to i64 + %add = add i64 %c, %zext + ret i64 %add +} + +define i64 @t6(i32 %a, i32 %b, i64 %c) { +; +; O1-LABEL: t6( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t6_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t6_param_1]; +; O1-NEXT: ld.param.b64 %rd1, [t6_param_2]; +; O1-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; +; O1-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NEXT: ret; +; +; O0-LABEL: t6( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t6_param_2]; +; O0-NEXT: ld.param.b32 %r2, [t6_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t6_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.u64.u32 %rd2, %r3; +; O0-NEXT: add.s64 %rd3, %rd2, %rd1; +; O0-NEXT: st.param.b64 [func_retval0], %rd3; +; O0-NEXT: ret; + %mul = mul nuw i32 %a, %b + %zext = zext i32 %mul to i64 + %add = add i64 %zext, %c + ret i64 %add +} + +define i32 @t7(i16 %a, i16 %b) { +; +; O1-LABEL: t7( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t7_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t7_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.u32.u16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t7( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t7_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t7_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul i16 %a, %b + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t8(i16 %a, i16 %b) { +; +; O1-LABEL: t8( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t8_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t8_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.s32.s16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t8( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t8_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t8_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul i16 %a, %b + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t9(i32 %a, i32 %b) { +; +; O1-LABEL: t9( +; O1: { +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t9_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t9_param_1]; +; O1-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O1-NEXT: cvt.u64.u32 %rd1, %r3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t9( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t9_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t9_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.u64.u32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul i32 %a, %b + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t10(i32 %a, i32 %b) { +; +; O1-LABEL: t10( +; O1: { +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t10_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t10_param_1]; +; O1-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O1-NEXT: cvt.s64.s32 %rd1, %r3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t10( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t10_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t10_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul i32 %a, %b + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t11(i16 %a, i16 %b) { +; +; O1-LABEL: t11( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t11_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t11_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.u32.u16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t11( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t11_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t11_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t12(i16 %a, i16 %b) { +; +; O1-LABEL: t12( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t12_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t12_param_1]; +; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t12( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t12_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t12_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t13(i32 %a, i32 %b) { +; +; O1-LABEL: t13( +; O1: { +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t13_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t13_param_1]; +; O1-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O1-NEXT: cvt.u64.u32 %rd1, %r3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t13( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t13_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t13_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.u64.u32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, %b + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t14(i32 %a, i32 %b) { +; +; O1-LABEL: t14( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t14_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t14_param_1]; +; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t14( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t14_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t14_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nsw i32 %a, %b + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t15(i16 %a, i16 %b) { +; +; O1-LABEL: t15( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t15_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t15_param_1]; +; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t15( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t15_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t15_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t16(i16 %a, i16 %b) { +; +; O1-LABEL: t16( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t16_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t16_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.s32.s16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t16( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t16_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t16_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t17(i32 %a, i32 %b) { +; +; O1-LABEL: t17( +; O1: { +; O1-NEXT: .reg .b32 %r<3>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t17_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t17_param_1]; +; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t17( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t17_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t17_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.u64.u32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nuw i32 %a, %b + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t18(i32 %a, i32 %b) { +; +; O1-LABEL: t18( +; O1: { +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t18_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t18_param_1]; +; O1-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O1-NEXT: cvt.s64.s32 %rd1, %r3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t18( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t18_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t18_param_0]; +; O0-NEXT: mul.lo.s32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nuw i32 %a, %b + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t19(i16 %a, i16 %b) { +; +; O1-LABEL: t19( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t19_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t19_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.u32.u16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t19( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t19_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t19_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul i16 %a, %b + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t20(i16 %a) { +; +; CHECK-LABEL: t20( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [t20_param_0]; +; CHECK-NEXT: shl.b16 %rs2, %rs1, 4; +; CHECK-NEXT: cvt.s32.s16 %r1, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %mul = shl i16 %a, 4 + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t21(i32 %a) { +; +; CHECK-LABEL: t21( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [t21_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 4; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %mul = shl i32 %a, 4 + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t22(i32 %a) { +; +; CHECK-LABEL: t22( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [t22_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 4; +; CHECK-NEXT: cvt.s64.s32 %rd1, %r2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %mul = shl i32 %a, 4 + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t23(i16 %a, i16 %b) { +; +; CHECK-LABEL: t23( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [t23_param_0]; +; CHECK-NEXT: shl.b16 %rs2, %rs1, 4; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %mul = shl nsw i16 %a, 4 + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t24(i16 %a, i16 %b) { +; +; O1-LABEL: t24( +; O1: { +; O1-NEXT: .reg .b16 %rs<2>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t24_param_0]; +; O1-NEXT: mul.wide.s16 %r1, %rs1, 16; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t24( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs1, [t24_param_0]; +; O0-NEXT: shl.b16 %rs2, %rs1, 4; +; O0-NEXT: cvt.s32.s16 %r1, %rs2; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = shl nsw i16 %a, 4 + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t25(i32 %a) { +; +; CHECK-LABEL: t25( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [t25_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 4; +; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %mul = shl nsw i32 %a, 4 + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t26(i32 %a) { +; +; O1-LABEL: t26( +; O1: { +; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t26_param_0]; +; O1-NEXT: mul.wide.s32 %rd1, %r1, 16; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t26( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t26_param_0]; +; O0-NEXT: shl.b32 %r2, %r1, 4; +; O0-NEXT: cvt.s64.s32 %rd1, %r2; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = shl nsw i32 %a, 4 + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t27(i16 %a, i16 %b) { +; +; O1-LABEL: t27( +; O1: { +; O1-NEXT: .reg .b16 %rs<2>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t27_param_0]; +; O1-NEXT: mul.wide.u16 %r1, %rs1, 16; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t27( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs1, [t27_param_0]; +; O0-NEXT: shl.b16 %rs2, %rs1, 4; +; O0-NEXT: cvt.u32.u16 %r1, %rs2; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = shl nuw i16 %a, 4 + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t28(i16 %a, i16 %b) { +; +; CHECK-LABEL: t28( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [t28_param_0]; +; CHECK-NEXT: shl.b16 %rs2, %rs1, 4; +; CHECK-NEXT: cvt.s32.s16 %r1, %rs2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %mul = shl nuw i16 %a, 4 + %sext = sext i16 %mul to i32 + ret i32 %sext +} + +define i64 @t29(i32 %a) { +; +; O1-LABEL: t29( +; O1: { +; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t29_param_0]; +; O1-NEXT: mul.wide.u32 %rd1, %r1, 16; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t29( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t29_param_0]; +; O0-NEXT: shl.b32 %r2, %r1, 4; +; O0-NEXT: cvt.u64.u32 %rd1, %r2; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = shl nuw i32 %a, 4 + %zext = zext i32 %mul to i64 + ret i64 %zext +} + +define i64 @t30(i32 %a) { +; +; CHECK-LABEL: t30( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [t30_param_0]; +; CHECK-NEXT: shl.b32 %r2, %r1, 4; +; CHECK-NEXT: cvt.s64.s32 %rd1, %r2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-NEXT: ret; + %mul = shl nuw i32 %a, 4 + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i64 @t31(i32 %a, i32 %b) { +; +; O1-LABEL: t31( +; O1: { +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b32 %r1, [t31_param_0]; +; O1-NEXT: ld.param.b32 %r2, [t31_param_1]; +; O1-NEXT: shl.b32 %r3, %r1, %r2; +; O1-NEXT: cvt.s64.s32 %rd1, %r3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t31( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [t31_param_1]; +; O0-NEXT: ld.param.b32 %r1, [t31_param_0]; +; O0-NEXT: shl.b32 %r3, %r1, %r2; +; O0-NEXT: cvt.s64.s32 %rd1, %r3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = shl nuw i32 %a, %b + %sext = sext i32 %mul to i64 + ret i64 %sext +} + +define i32 @t32(i16 %a, i16 %b, i32 %c) { +; +; O1-LABEL: t32( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t32_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t32_param_1]; +; O1-NEXT: ld.param.b32 %r1, [t32_param_2]; +; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: ret; +; +; O0-LABEL: t32( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t32_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t32_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t32_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r2, %rs3; +; O0-NEXT: add.s32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %sext = sext i16 %mul to i32 + %add = add i32 %c, %sext + ret i32 %add +} + +define i32 @t33(i16 %a, i16 %b, i32 %c) { +; +; O1-LABEL: t33( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t33_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t33_param_1]; +; O1-NEXT: ld.param.b32 %r1, [t33_param_2]; +; O1-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: ret; +; +; O0-LABEL: t33( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t33_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t33_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t33_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r2, %rs3; +; O0-NEXT: add.s32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %sext = sext i16 %mul to i32 + %add = add i32 %c, %sext + ret i32 %add +} + +define i32 @t34(i16 %a, i16 %b) { +; +; O1-LABEL: t34( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t34_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t34_param_1]; +; O1-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t34( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t34_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t34_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s32.s16 %r1, %rs3; +; O0-NEXT: add.s32 %r2, %r1, 1; +; O0-NEXT: st.param.b32 [func_retval0], %r2; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %sext = sext i16 %mul to i32 + %add = add i32 1, %sext + ret i32 %add +} + +define i32 @t35(i16 %a, i32 %c) { +; +; O1-LABEL: t35( +; O1: { +; O1-NEXT: .reg .b16 %rs<2>; +; O1-NEXT: .reg .b32 %r<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t35_param_0]; +; O1-NEXT: ld.param.b32 %r1, [t35_param_1]; +; O1-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: ret; +; +; O0-LABEL: t35( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t35_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t35_param_0]; +; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3; +; O0-NEXT: cvt.s32.s16 %r2, %rs2; +; O0-NEXT: add.s32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, 3 + %sext = sext i16 %mul to i32 + %add = add i32 %c, %sext + ret i32 %add +} + +define i32 @t36(i16 %a, i32 %c) { +; +; O1-LABEL: t36( +; O1: { +; O1-NEXT: .reg .b16 %rs<2>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t36_param_0]; +; O1-NEXT: mad.wide.s16 %r1, %rs1, 3, 5; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t36( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs1, [t36_param_0]; +; O0-NEXT: mul.lo.s16 %rs2, %rs1, 3; +; O0-NEXT: cvt.s32.s16 %r1, %rs2; +; O0-NEXT: add.s32 %r2, %r1, 5; +; O0-NEXT: st.param.b32 [func_retval0], %r2; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, 3 + %sext = sext i16 %mul to i32 + %add = add i32 5, %sext + ret i32 %add +} + +define i32 @t37(i16 %a, i16 %b, i32 %c) { +; +; O1-LABEL: t37( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t37_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t37_param_1]; +; O1-NEXT: ld.param.b32 %r1, [t37_param_2]; +; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: ret; +; +; O0-LABEL: t37( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t37_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t37_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t37_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r2, %rs3; +; O0-NEXT: add.s32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + %zext = zext i16 %mul to i32 + %add = add i32 %c, %zext + ret i32 %add +} + +define i32 @t38(i16 %a, i16 %b, i32 %c) { +; +; O1-LABEL: t38( +; O1: { +; O1-NEXT: .reg .b16 %rs<3>; +; O1-NEXT: .reg .b32 %r<3>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t38_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t38_param_1]; +; O1-NEXT: ld.param.b32 %r1, [t38_param_2]; +; O1-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NEXT: ret; +; +; O0-LABEL: t38( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [t38_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t38_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t38_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u32.u16 %r2, %rs3; +; O0-NEXT: add.s32 %r3, %r2, %r1; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + %zext = zext i16 %mul to i32 + %add = add i32 %zext, %c + ret i32 %add +} + +define i64 @t39(i16 %a, i16 %b) { +; O1-LABEL: t39( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t39_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t39_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.u64.u16 %rd1, %rs3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t39( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t39_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t39_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u64.u16 %rd1, %rs3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul i16 %a, %b + %zext = zext i16 %mul to i64 + ret i64 %zext +} + +define i64 @t40(i16 %a, i16 %b) { +; O1-LABEL: t40( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t40_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t40_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.u64.u16 %rd1, %rs3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t40( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t40_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t40_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.u64.u16 %rd1, %rs3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + %zext = zext i16 %mul to i64 + ret i64 %zext +} + +define i64 @t41(i16 %a, i16 %b) { +; O1-LABEL: t41( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t41_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t41_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: cvt.s64.s16 %rd1, %rs3; +; O1-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-NEXT: ret; +; +; O0-LABEL: t41( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b16 %rs2, [t41_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t41_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: cvt.s64.s16 %rd1, %rs3; +; O0-NEXT: st.param.b64 [func_retval0], %rd1; +; O0-NEXT: ret; + %mul = mul nsw i16 %a, %b + %sext = sext i16 %mul to i64 + ret i64 %sext +} + +define i32 @t42(i16 %a, i16 %b, ptr %ptr) { +; O1-LABEL: t42( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<2>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t42_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t42_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: ld.param.b64 %rd1, [t42_param_2]; +; O1-NEXT: st.b16 [%rd1], %rs3; +; O1-NEXT: cvt.u32.u16 %r1, %rs3; +; O1-NEXT: st.param.b32 [func_retval0], %r1; +; O1-NEXT: ret; +; +; O0-LABEL: t42( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t42_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t42_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t42_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: st.b16 [%rd1], %rs3; +; O0-NEXT: cvt.u32.u16 %r1, %rs3; +; O0-NEXT: st.param.b32 [func_retval0], %r1; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + store i16 %mul, ptr %ptr + %zext = zext i16 %mul to i32 + ret i32 %zext +} + +define i32 @t43(i16 %a, i16 %b, i32 %c, ptr %ptr) { +; O1-LABEL: t43( +; O1: { +; O1-NEXT: .reg .b16 %rs<4>; +; O1-NEXT: .reg .b32 %r<4>; +; O1-NEXT: .reg .b64 %rd<2>; +; O1-EMPTY: +; O1-NEXT: // %bb.0: +; O1-NEXT: ld.param.b16 %rs1, [t43_param_0]; +; O1-NEXT: ld.param.b16 %rs2, [t43_param_1]; +; O1-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O1-NEXT: ld.param.b64 %rd1, [t43_param_3]; +; O1-NEXT: st.b16 [%rd1], %rs3; +; O1-NEXT: ld.param.b32 %r1, [t43_param_2]; +; O1-NEXT: cvt.u32.u16 %r2, %rs3; +; O1-NEXT: add.s32 %r3, %r2, %r1; +; O1-NEXT: st.param.b32 [func_retval0], %r3; +; O1-NEXT: ret; +; +; O0-LABEL: t43( +; O0: { +; O0-NEXT: .reg .b16 %rs<4>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [t43_param_3]; +; O0-NEXT: ld.param.b32 %r1, [t43_param_2]; +; O0-NEXT: ld.param.b16 %rs2, [t43_param_1]; +; O0-NEXT: ld.param.b16 %rs1, [t43_param_0]; +; O0-NEXT: mul.lo.s16 %rs3, %rs1, %rs2; +; O0-NEXT: st.b16 [%rd1], %rs3; +; O0-NEXT: cvt.u32.u16 %r2, %rs3; +; O0-NEXT: add.s32 %r3, %r2, %r1; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; + %mul = mul nuw i16 %a, %b + store i16 %mul, ptr %ptr + %zext = zext i16 %mul to i32 + %add = add i32 %zext, %c + ret i32 %add +} diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll new file mode 100644 index 0000000..6ee0fb2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ +; RUN: -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK-F32X2 +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \ +; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} +target triple = "nvptx64-nvidia-cuda" + +; Since fdiv doesn't support f32x2, this will create BUILD_VECTORs that will be +; folded into the store, turning it into st.global.v8.b32. +define void @writevec(<8 x float> %v1, <8 x float> %v2, ptr addrspace(1) %p) { +; CHECK-F32X2-LABEL: writevec( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<25>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [writevec_param_0]; +; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [writevec_param_0+16]; +; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [writevec_param_1+16]; +; CHECK-F32X2-NEXT: div.rn.f32 %r13, %r8, %r12; +; CHECK-F32X2-NEXT: div.rn.f32 %r14, %r7, %r11; +; CHECK-F32X2-NEXT: div.rn.f32 %r15, %r6, %r10; +; CHECK-F32X2-NEXT: div.rn.f32 %r16, %r5, %r9; +; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [writevec_param_1]; +; CHECK-F32X2-NEXT: div.rn.f32 %r21, %r4, %r20; +; CHECK-F32X2-NEXT: div.rn.f32 %r22, %r3, %r19; +; CHECK-F32X2-NEXT: div.rn.f32 %r23, %r2, %r18; +; CHECK-F32X2-NEXT: div.rn.f32 %r24, %r1, %r17; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [writevec_param_2]; +; CHECK-F32X2-NEXT: st.global.v8.b32 [%rd1], {%r24, %r23, %r22, %r21, %r16, %r15, %r14, %r13}; +; CHECK-F32X2-NEXT: ret; + %v = fdiv <8 x float> %v1, %v2 + store <8 x float> %v, ptr addrspace(1) %p, align 32 + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 06c2cc8..26336b8 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -343,61 +343,77 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_smax( ; O0: { ; O0-NEXT: .reg .pred %p<5>; -; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b32 %r<26>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1]; ; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O0-NEXT: setp.gt.s32 %p1, %r4, %r3; -; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O0-NEXT: setp.gt.s32 %p2, %r6, %r5; -; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O0-NEXT: setp.gt.s32 %p3, %r8, %r7; -; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O0-NEXT: setp.gt.s32 %p4, %r10, %r9; -; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r25; ; O0-NEXT: ret; ; ; O3-LABEL: test_smax( ; O3: { ; O3-NEXT: .reg .pred %p<5>; -; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b32 %r<26>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_smax_param_0]; ; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O3-NEXT: setp.gt.s32 %p1, %r4, %r3; -; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O3-NEXT: setp.gt.s32 %p2, %r6, %r5; -; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O3-NEXT: setp.gt.s32 %p3, %r8, %r7; -; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O3-NEXT: setp.gt.s32 %p4, %r10, %r9; -; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; ; O3-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b @@ -473,61 +489,77 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; O0-LABEL: test_smin( ; O0: { ; O0-NEXT: .reg .pred %p<5>; -; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b32 %r<26>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: ; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1]; ; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O0-NEXT: setp.le.s32 %p1, %r4, %r3; -; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O0-NEXT: setp.le.s32 %p2, %r6, %r5; -; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O0-NEXT: setp.le.s32 %p3, %r8, %r7; -; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O0-NEXT: setp.le.s32 %p4, %r10, %r9; -; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r25; ; O0-NEXT: ret; ; ; O3-LABEL: test_smin( ; O3: { ; O3-NEXT: .reg .pred %p<5>; -; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b32 %r<26>; ; O3-EMPTY: ; O3-NEXT: // %bb.0: ; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0]; ; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x8880U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x8880U; ; O3-NEXT: setp.le.s32 %p1, %r4, %r3; -; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x9991U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; ; O3-NEXT: setp.le.s32 %p2, %r6, %r5; -; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0xaaa2U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0xaaa2U; ; O3-NEXT: setp.le.s32 %p3, %r8, %r7; -; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0xbbb3U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0xbbb3U; ; O3-NEXT: setp.le.s32 %p4, %r10, %r9; -; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: prmt.b32 %r11, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r12, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r14, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p4; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r13, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r16, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r21, %r20, %r12, %p2; +; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r11, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r21, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; ; O3-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b diff --git a/llvm/test/CodeGen/NVPTX/ld-param-sink.ll b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll new file mode 100644 index 0000000..03523a3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/ld-param-sink.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +declare ptr @bar(i64) +declare i64 @baz() + +define ptr @foo(i1 %cond) { +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.b8 %rs1, [foo_param_0]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: { // callseq 0, 0 +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: call.uni (retval0), baz, (); +; CHECK-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-NEXT: } // callseq 0 +; CHECK-NEXT: @%p1 bra $L__BB0_2; +; CHECK-NEXT: // %bb.1: // %bb +; CHECK-NEXT: { // callseq 1, 0 +; CHECK-NEXT: .param .b64 param0; +; CHECK-NEXT: .param .b64 retval0; +; CHECK-NEXT: st.param.b64 [param0], %rd2; +; CHECK-NEXT: call.uni (retval0), bar, (param0); +; CHECK-NEXT: } // callseq 1 +; CHECK-NEXT: $L__BB0_2: // %common.ret +; CHECK-NEXT: st.param.b64 [func_retval0], 0; +; CHECK-NEXT: ret; +entry: + %call = call i64 @baz() + br i1 %cond, label %common.ret, label %bb + +bb: + %tmp = call ptr @bar(i64 %call) + br label %common.ret + +common.ret: + ret ptr null +} diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll index 5c30173..ae069cf 100644 --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -114,15 +114,14 @@ define void @foo3(i32 %a) { ; PTX64-NEXT: .reg .b64 %SP; ; PTX64-NEXT: .reg .b64 %SPL; ; PTX64-NEXT: .reg .b32 %r<2>; -; PTX64-NEXT: .reg .b64 %rd<5>; +; PTX64-NEXT: .reg .b64 %rd<4>; ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot2; ; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0]; ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; -; PTX64-NEXT: mul.wide.s32 %rd3, %r1, 4; -; PTX64-NEXT: add.s64 %rd4, %rd2, %rd3; -; PTX64-NEXT: st.local.b32 [%rd4], %r1; +; PTX64-NEXT: mad.wide.s32 %rd3, %r1, 4, %rd2; +; PTX64-NEXT: st.local.b32 [%rd3], %r1; ; PTX64-NEXT: ret; %local = alloca [3 x i32], align 4 %1 = getelementptr inbounds i32, ptr %local, i32 %a diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index e16fc74..6f0dff7 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -154,7 +154,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-NEXT: .reg .b64 %rd<6>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [foo_complex_param_0]; @@ -166,12 +166,11 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 ; CHECK-NEXT: shl.b32 %r6, %r1, 1; ; CHECK-NEXT: or.b32 %r7, %r5, %r6; ; CHECK-NEXT: cvt.u64.u32 %rd2, %r7; -; CHECK-NEXT: mul.wide.u32 %rd3, %r3, 131072; -; CHECK-NEXT: add.s64 %rd4, %rd1, %rd3; -; CHECK-NEXT: add.s64 %rd5, %rd4, %rd2; -; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd5+128]; +; CHECK-NEXT: mad.wide.u32 %rd3, %r3, 131072, %rd1; +; CHECK-NEXT: add.s64 %rd4, %rd3, %rd2; +; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd4+128]; ; CHECK-NEXT: max.u16 %rs3, %rs1, %rs2; -; CHECK-NEXT: st.b8 [%rd5+129], %rs3; +; CHECK-NEXT: st.b8 [%rd4+129], %rs3; ; CHECK-NEXT: ret; %t0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !1 %t1 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() |